summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c5
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c65
-rw-r--r--fs/9p/vfs_inode_dotl.c9
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/dir_f.c12
-rw-r--r--fs/affs/amigaffs.c27
-rw-r--r--fs/affs/file.c26
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/affs/super.c6
-rw-r--r--fs/afs/cmservice.c16
-rw-r--r--fs/afs/dynroot.c20
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/flock.c1
-rw-r--r--fs/afs/fs_operation.c1
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/fsclient.c42
-rw-r--r--fs/afs/inode.c47
-rw-r--r--fs/afs/internal.h15
-rw-r--r--fs/afs/misc.c18
-rw-r--r--fs/afs/proc.c5
-rw-r--r--fs/afs/rotate.c2
-rw-r--r--fs/afs/rxrpc.c6
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/vl_list.c1
-rw-r--r--fs/afs/vl_probe.c82
-rw-r--r--fs/afs/vl_rotate.c7
-rw-r--r--fs/afs/vlclient.c24
-rw-r--r--fs/afs/write.c13
-rw-r--r--fs/afs/yfsclient.c50
-rw-r--r--fs/aio.c10
-rw-r--r--fs/autofs/dev-ioctl.c4
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/binfmt_flat.c20
-rw-r--r--fs/block_dev.c184
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/backref.c3
-rw-r--r--fs/btrfs/block-group.c70
-rw-r--r--fs/btrfs/btrfs_inode.h30
-rw-r--r--fs/btrfs/compression.c35
-rw-r--r--fs/btrfs/compression.h35
-rw-r--r--fs/btrfs/ctree.c212
-rw-r--r--fs/btrfs/ctree.h109
-rw-r--r--fs/btrfs/delalloc-space.c123
-rw-r--r--fs/btrfs/delayed-inode.c6
-rw-r--r--fs/btrfs/dev-replace.c118
-rw-r--r--fs/btrfs/disk-io.c173
-rw-r--r--fs/btrfs/disk-io.h9
-rw-r--r--fs/btrfs/extent-io-tree.h5
-rw-r--r--fs/btrfs/extent-tree.c258
-rw-r--r--fs/btrfs/extent_io.c224
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c318
-rw-r--r--fs/btrfs/free-space-cache.c29
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/inode.c831
-rw-r--r--fs/btrfs/ioctl.c96
-rw-r--r--fs/btrfs/locking.c45
-rw-r--r--fs/btrfs/locking.h78
-rw-r--r--fs/btrfs/ordered-data.c113
-rw-r--r--fs/btrfs/ordered-data.h24
-rw-r--r--fs/btrfs/print-tree.c50
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/reada.c30
-rw-r--r--fs/btrfs/reflink.c46
-rw-r--r--fs/btrfs/relocation.c11
-rw-r--r--fs/btrfs/root-tree.c13
-rw-r--r--fs/btrfs/scrub.c130
-rw-r--r--fs/btrfs/send.c365
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/space-info.c323
-rw-r--r--fs/btrfs/space-info.h2
-rw-r--r--fs/btrfs/struct-funcs.c10
-rw-r--r--fs/btrfs/super.c37
-rw-r--r--fs/btrfs/sysfs.c257
-rw-r--r--fs/btrfs/sysfs.h11
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c7
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-checker.c19
-rw-r--r--fs/btrfs/tree-log.c302
-rw-r--r--fs/btrfs/tree-log.h32
-rw-r--r--fs/btrfs/volumes.c433
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/buffer.c27
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/ceph/addr.c23
-rw-r--r--fs/ceph/caps.c26
-rw-r--r--fs/ceph/debugfs.c20
-rw-r--r--fs/ceph/dir.c37
-rw-r--r--fs/ceph/file.c12
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/mds_client.c184
-rw-r--r--fs/ceph/mds_client.h9
-rw-r--r--fs/ceph/mdsmap.c10
-rw-r--r--fs/ceph/metric.c149
-rw-r--r--fs/ceph/metric.h91
-rw-r--r--fs/ceph/quota.c4
-rw-r--r--fs/ceph/super.c64
-rw-r--r--fs/ceph/super.h79
-rw-r--r--fs/ceph/xattr.c12
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifssmb.c2
-rw-r--r--fs/cifs/connect.c13
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smb2inode.c1
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/cifs/smb2pdu.c6
-rw-r--r--fs/compat.c132
-rw-r--r--fs/configfs/dir.c4
-rw-r--r--fs/coredump.c17
-rw-r--r--fs/crypto/crypto.c4
-rw-r--r--fs/crypto/fname.c60
-rw-r--r--fs/crypto/fscrypt_private.h10
-rw-r--r--fs/crypto/hooks.c80
-rw-r--r--fs/crypto/inline_crypt.c7
-rw-r--r--fs/crypto/keyring.c9
-rw-r--r--fs/crypto/keysetup.c182
-rw-r--r--fs/crypto/keysetup_v1.c8
-rw-r--r--fs/crypto/policy.c209
-rw-r--r--fs/d_path.c6
-rw-r--r--fs/dax.c30
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/file.c4
-rw-r--r--fs/direct-io.c19
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c66
-rw-r--r--fs/dlm/config.h4
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lowcomms.c329
-rw-r--r--fs/dlm/midcomms.c136
-rw-r--r--fs/dlm/midcomms.h3
-rw-r--r--fs/efivarfs/super.c3
-rw-r--r--fs/erofs/data.c2
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/zdata.c48
-rw-r--r--fs/erofs/zmap.c6
-rw-r--r--fs/eventpoll.c102
-rw-r--r--fs/exec.c38
-rw-r--r--fs/exfat/balloc.c4
-rw-r--r--fs/exfat/cache.c11
-rw-r--r--fs/exfat/dir.c32
-rw-r--r--fs/exfat/exfat_fs.h17
-rw-r--r--fs/exfat/exfat_raw.h5
-rw-r--r--fs/exfat/fatent.c58
-rw-r--r--fs/exfat/file.c9
-rw-r--r--fs/exfat/inode.c15
-rw-r--r--fs/exfat/misc.c22
-rw-r--r--fs/exfat/namei.c45
-rw-r--r--fs/exfat/super.c53
-rw-r--r--fs/ext2/file.c6
-rw-r--r--fs/ext2/inode.c4
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/block_validity.c159
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h97
-rw-r--r--fs/ext4/ext4_jbd2.c25
-rw-r--r--fs/ext4/extents.c42
-rw-r--r--fs/ext4/file.c11
-rw-r--r--fs/ext4/hash.c4
-rw-r--r--fs/ext4/ialloc.c119
-rw-r--r--fs/ext4/indirect.c20
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c32
-rw-r--r--fs/ext4/ioctl.c34
-rw-r--r--fs/ext4/mballoc.c289
-rw-r--r--fs/ext4/mballoc.h4
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/namei.c73
-rw-r--r--fs/ext4/readpage.c4
-rw-r--r--fs/ext4/super.c284
-rw-r--r--fs/ext4/sysfs.c13
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/checkpoint.c15
-rw-r--r--fs/f2fs/compress.c89
-rw-r--r--fs/f2fs/data.c96
-rw-r--r--fs/f2fs/debug.c64
-rw-r--r--fs/f2fs/dir.c8
-rw-r--r--fs/f2fs/extent_cache.c18
-rw-r--r--fs/f2fs/f2fs.h102
-rw-r--r--fs/f2fs/file.c264
-rw-r--r--fs/f2fs/gc.c75
-rw-r--r--fs/f2fs/inline.c21
-rw-r--r--fs/f2fs/inode.c4
-rw-r--r--fs/f2fs/namei.c25
-rw-r--r--fs/f2fs/node.c45
-rw-r--r--fs/f2fs/recovery.c12
-rw-r--r--fs/f2fs/segment.c131
-rw-r--r--fs/f2fs/segment.h10
-rw-r--r--fs/f2fs/super.c83
-rw-r--r--fs/f2fs/sysfs.c23
-rw-r--r--fs/f2fs/verity.c6
-rw-r--r--fs/f2fs/xattr.c4
-rw-r--r--fs/fat/Kconfig2
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fat/file.c4
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/file.c2
-rw-r--r--fs/fs-writeback.c112
-rw-r--r--fs/fs_context.c2
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/fs_struct.c4
-rw-r--r--fs/fsopen.c2
-rw-r--r--fs/fuse/file.c25
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/fuse/virtio_fs.c4
-rw-r--r--fs/gfs2/bmap.c73
-rw-r--r--fs/gfs2/file.c31
-rw-r--r--fs/gfs2/glock.c10
-rw-r--r--fs/gfs2/log.c33
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/gfs2/trans.c30
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/hugetlbfs/inode.c6
-rw-r--r--fs/internal.h3
-rw-r--r--fs/io-wq.c221
-rw-r--r--fs/io-wq.h4
-rw-r--r--fs/io_uring.c2990
-rw-r--r--fs/iomap/buffered-io.c194
-rw-r--r--fs/iomap/direct-io.c49
-rw-r--r--fs/iomap/seek.c4
-rw-r--r--fs/jbd2/journal.c16
-rw-r--r--fs/jbd2/recovery.c46
-rw-r--r--fs/jbd2/transaction.c33
-rw-r--r--fs/jffs2/dir.c6
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/scan.c3
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/locks.c6
-rw-r--r--fs/minix/inode.c42
-rw-r--r--fs/minix/itree_common.c8
-rw-r--r--fs/minix/itree_v1.c12
-rw-r--r--fs/minix/itree_v2.c13
-rw-r--r--fs/minix/minix.h1
-rw-r--r--fs/namei.c16
-rw-r--r--fs/namespace.c29
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c2
-rw-r--r--fs/nfs/client.c22
-rw-r--r--fs/nfs/dir.c29
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c17
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c107
-rw-r--r--fs/nfs/fs_context.c219
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs42.h24
-rw-r--r--fs/nfs/nfs42proc.c268
-rw-r--r--fs/nfs/nfs42xattr.c1056
-rw-r--r--fs/nfs/nfs42xdr.c438
-rw-r--r--fs/nfs/nfs4_fs.h37
-rw-r--r--fs/nfs/nfs4client.c33
-rw-r--r--fs/nfs/nfs4file.c7
-rw-r--r--fs/nfs/nfs4idmap.c4
-rw-r--r--fs/nfs/nfs4proc.c282
-rw-r--r--fs/nfs/nfs4state.c16
-rw-r--r--fs/nfs/nfs4super.c10
-rw-r--r--fs/nfs/nfs4trace.h46
-rw-r--r--fs/nfs/nfs4xdr.c39
-rw-r--r--fs/nfs/nfstrace.h3
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c54
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nfs/super.c9
-rw-r--r--fs/nfs_common/nfsacl.c2
-rw-r--r--fs/nfsd/blocklayout.c8
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c2
-rw-r--r--fs/nfsd/nfs4state.c14
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsproc.c2
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/nilfs2/alloc.c38
-rw-r--r--fs/nilfs2/bmap.c2
-rw-r--r--fs/nilfs2/btree.c42
-rw-r--r--fs/nilfs2/cpfile.c10
-rw-r--r--fs/nilfs2/dat.c14
-rw-r--r--fs/nilfs2/direct.c14
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/ifile.c4
-rw-r--r--fs/nilfs2/inode.c32
-rw-r--r--fs/nilfs2/ioctl.c37
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h18
-rw-r--r--fs/nilfs2/page.c11
-rw-r--r--fs/nilfs2/recovery.c34
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segment.c57
-rw-r--r--fs/nilfs2/sufile.c29
-rw-r--r--fs/nilfs2/super.c73
-rw-r--r--fs/nilfs2/sysfs.c29
-rw-r--r--fs/nilfs2/the_nilfs.c85
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c28
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/open.c6
-rw-r--r--fs/orangefs/acl.c19
-rw-r--r--fs/orangefs/orangefs-mod.c1
-rw-r--r--fs/pipe.c73
-rw-r--r--fs/proc/base.c14
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c112
-rw-r--r--fs/pstore/zone.c1
-rw-r--r--fs/quota/Kconfig5
-rw-r--r--fs/quota/Makefile1
-rw-r--r--fs/quota/compat.c120
-rw-r--r--fs/quota/compat.h34
-rw-r--r--fs/quota/quota.c75
-rw-r--r--fs/read_write.c370
-rw-r--r--fs/romfs/storage.c4
-rw-r--r--fs/seq_file.c2
-rw-r--r--fs/signalfd.c12
-rw-r--r--fs/splice.c85
-rw-r--r--fs/squashfs/block.c6
-rw-r--r--fs/super.c2
-rw-r--r--fs/ubifs/dir.c40
-rw-r--r--fs/ubifs/journal.c10
-rw-r--r--fs/ubifs/lprops.c4
-rw-r--r--fs/ubifs/misc.h2
-rw-r--r--fs/ubifs/sb.c5
-rw-r--r--fs/ubifs/super.c20
-rw-r--r--fs/ubifs/ubifs.h1
-rw-r--r--fs/udf/symlink.c2
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/ufs/util.h12
-rw-r--r--fs/userfaultfd.c4
-rw-r--r--fs/vboxsf/super.c4
-rw-r--r--fs/vboxsf/utils.c2
-rw-r--r--fs/xattr.c22
-rw-r--r--fs/xfs/kmem.c22
-rw-r--r--fs/xfs/kmem.h7
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr.c14
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c47
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h29
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h6
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c35
-rw-r--r--fs/xfs/libxfs/xfs_format.h211
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c65
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c130
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h15
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c8
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h1
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c8
-rw-r--r--fs/xfs/libxfs/xfs_shared.h3
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c21
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/scrub/agheader.c30
-rw-r--r--fs/xfs/scrub/agheader_repair.c24
-rw-r--r--fs/xfs/scrub/inode.c31
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_list.c8
-rw-r--r--fs/xfs/xfs_bmap_util.c18
-rw-r--r--fs/xfs/xfs_buf.c208
-rw-r--r--fs/xfs/xfs_buf.h17
-rw-r--r--fs/xfs/xfs_buf_item.c266
-rw-r--r--fs/xfs/xfs_buf_item.h12
-rw-r--r--fs/xfs/xfs_buf_item_recover.c4
-rw-r--r--fs/xfs/xfs_dquot.c68
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_export.c2
-rw-r--r--fs/xfs/xfs_file.c29
-rw-r--r--fs/xfs/xfs_icache.c19
-rw-r--r--fs/xfs/xfs_inode.c87
-rw-r--r--fs/xfs/xfs_inode.h38
-rw-r--r--fs/xfs/xfs_inode_item.c65
-rw-r--r--fs/xfs/xfs_inode_item.h5
-rw-r--r--fs/xfs/xfs_inode_item_recover.c76
-rw-r--r--fs/xfs/xfs_ioctl.c7
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_log_cil.c2
-rw-r--r--fs/xfs/xfs_log_recover.c62
-rw-r--r--fs/xfs/xfs_mount.c32
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_ondisk.h38
-rw-r--r--fs/xfs/xfs_qm.c13
-rw-r--r--fs/xfs/xfs_qm.h4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c18
-rw-r--r--fs/xfs/xfs_quota.h8
-rw-r--r--fs/xfs/xfs_refcount_item.c2
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c13
-rw-r--r--fs/xfs/xfs_super.c28
-rw-r--r--fs/xfs/xfs_sysfs.h6
-rw-r--r--fs/xfs/xfs_trace.h29
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c4
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_dquot.c6
-rw-r--r--fs/zonefs/super.c16
-rw-r--r--fs/zonefs/zonefs.h3
419 files changed, 13535 insertions, 7819 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 15a99f9c7253..39def020a074 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -500,10 +500,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
}
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->fscache) {
+ if (v9ses->fscache)
v9fs_cache_session_put_cookie(v9ses);
- kfree(v9ses->cachetag);
- }
+ kfree(v9ses->cachetag);
#endif
kfree(v9ses->uname);
kfree(v9ses->aname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 92cd1d80218d..6ecf863bfa2f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -213,7 +213,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fall through */
+ fallthrough;
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
inode = file_inode(vma->vm_file);
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c9255d399917..ae0c38ad1fcb 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -223,8 +223,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
- v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
- GFP_KERNEL);
+ v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
#ifdef CONFIG_9P_FSCACHE
@@ -368,59 +367,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
return inode;
}
-/*
-static struct v9fs_fid*
-v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
-{
- int err;
- int nfid;
- struct v9fs_fid *ret;
- struct v9fs_fcall *fcall;
-
- nfid = v9fs_get_idpool(&v9ses->fidpool);
- if (nfid < 0) {
- eprintk(KERN_WARNING, "no free fids available\n");
- return ERR_PTR(-ENOSPC);
- }
-
- err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
- &fcall);
-
- if (err < 0) {
- if (fcall && fcall->id == RWALK)
- goto clunk_fid;
-
- PRINT_FCALL_ERROR("walk error", fcall);
- v9fs_put_idpool(nfid, &v9ses->fidpool);
- goto error;
- }
-
- kfree(fcall);
- fcall = NULL;
- ret = v9fs_fid_create(v9ses, nfid);
- if (!ret) {
- err = -ENOMEM;
- goto clunk_fid;
- }
-
- err = v9fs_fid_insert(ret, dentry);
- if (err < 0) {
- v9fs_fid_destroy(ret);
- goto clunk_fid;
- }
-
- return ret;
-
-clunk_fid:
- v9fs_t_clunk(v9ses, nfid);
-
-error:
- kfree(fcall);
- return ERR_PTR(err);
-}
-*/
-
-
/**
* v9fs_clear_inode - release an inode
* @inode: inode to release
@@ -1090,7 +1036,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
{
int retval;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid;
+ struct p9_fid *fid = NULL;
struct p9_wstat wstat;
p9_debug(P9_DEBUG_VFS, "\n");
@@ -1100,7 +1046,12 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
+ if (iattr->ia_valid & ATTR_FILE) {
+ fid = iattr->ia_file->private_data;
+ WARN_ON(!fid);
+ }
+ if (!fid)
+ fid = v9fs_fid_lookup(dentry);
if(IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 60328b21c5fb..0028eccb665a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -540,7 +540,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
{
int retval;
- struct p9_fid *fid;
+ struct p9_fid *fid = NULL;
struct p9_iattr_dotl p9attr;
struct inode *inode = d_inode(dentry);
@@ -560,7 +560,12 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
- fid = v9fs_fid_lookup(dentry);
+ if (iattr->ia_valid & ATTR_FILE) {
+ fid = iattr->ia_file->private_data;
+ WARN_ON(!fid);
+ }
+ if (!fid)
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 74df32be4c6a..e34fa20acf61 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (ret)
return ret;
- if (v9ses->cache)
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+ if (!v9ses->cache) {
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
+ }
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..d72ee2ce7af0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -37,7 +37,6 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 30d526fecc3f..05e963402e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -18,11 +18,11 @@ static inline unsigned int adfs_readval(unsigned char *p, int len)
switch (len) {
case 4: val |= p[3] << 24;
- /* fall through */
+ fallthrough;
case 3: val |= p[2] << 16;
- /* fall through */
+ fallthrough;
case 2: val |= p[1] << 8;
- /* fall through */
+ fallthrough;
default: val |= p[0];
}
return val;
@@ -32,11 +32,11 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val)
{
switch (len) {
case 4: p[3] = val >> 24;
- /* fall through */
+ fallthrough;
case 3: p[2] = val >> 16;
- /* fall through */
+ fallthrough;
case 2: p[1] = val >> 8;
- /* fall through */
+ fallthrough;
default: p[0] = val;
}
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index f708c45d5f66..29f11e10a7c7 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -420,24 +420,51 @@ affs_mode_to_prot(struct inode *inode)
u32 prot = AFFS_I(inode)->i_protect;
umode_t mode = inode->i_mode;
+ /*
+ * First, clear all RWED bits for owner, group, other.
+ * Then, recalculate them afresh.
+ *
+ * We'll always clear the delete-inhibit bit for the owner, as that is
+ * the classic single-user mode AmigaOS protection bit and we need to
+ * stay compatible with all scenarios.
+ *
+ * Since multi-user AmigaOS is an extension, we'll only set the
+ * delete-allow bit if any of the other bits in the same user class
+ * (group/other) are used.
+ */
+ prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD
+ | FIBF_NOWRITE | FIBF_NODELETE
+ | FIBF_GRP_EXECUTE | FIBF_GRP_READ
+ | FIBF_GRP_WRITE | FIBF_GRP_DELETE
+ | FIBF_OTR_EXECUTE | FIBF_OTR_READ
+ | FIBF_OTR_WRITE | FIBF_OTR_DELETE);
+
+ /* Classic single-user AmigaOS flags. These are inverted. */
if (!(mode & 0100))
prot |= FIBF_NOEXECUTE;
if (!(mode & 0400))
prot |= FIBF_NOREAD;
if (!(mode & 0200))
prot |= FIBF_NOWRITE;
+
+ /* Multi-user extended flags. Not inverted. */
if (mode & 0010)
prot |= FIBF_GRP_EXECUTE;
if (mode & 0040)
prot |= FIBF_GRP_READ;
if (mode & 0020)
prot |= FIBF_GRP_WRITE;
+ if (mode & 0070)
+ prot |= FIBF_GRP_DELETE;
+
if (mode & 0001)
prot |= FIBF_OTR_EXECUTE;
if (mode & 0004)
prot |= FIBF_OTR_READ;
if (mode & 0002)
prot |= FIBF_OTR_WRITE;
+ if (mode & 0007)
+ prot |= FIBF_OTR_DELETE;
AFFS_I(inode)->i_protect = prot;
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a26a0f96c119..d91b0133d95d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,6 +429,24 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int affs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
+ return ret;
+}
+
static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,affs_get_block);
@@ -438,7 +456,7 @@ const struct address_space_operations affs_aops = {
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
- .write_end = generic_write_end,
+ .write_end = affs_write_end,
.direct_IO = affs_direct_IO,
.bmap = _affs_bmap
};
@@ -795,6 +813,12 @@ done:
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
err_first_bh:
unlock_page(page);
put_page(page);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a346cf7659f1..044412110b52 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -93,7 +93,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
case ST_ROOT:
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;
- /* fall through */
+ fallthrough;
case ST_USERDIR:
if (be32_to_cpu(tail->stype) == ST_USERDIR ||
affs_test_opt(sbi->s_flags, SF_SETMODE)) {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 47107c6712a6..a100cd9950c8 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -474,7 +474,7 @@ got_root:
case MUFS_INTLFFS:
case MUFS_DCFFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall thru */
+ fallthrough;
case FS_INTLFFS:
case FS_DCFFS:
affs_set_opt(sbi->s_flags, SF_INTL);
@@ -486,7 +486,7 @@ got_root:
break;
case MUFS_OFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall through */
+ fallthrough;
case FS_OFS:
affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= SB_NOEXEC;
@@ -494,7 +494,7 @@ got_root:
case MUFS_DCOFS:
case MUFS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall through */
+ fallthrough;
case FS_DCOFS:
case FS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_INTL);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index bef413818af7..a4e9e6e07e93 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -252,7 +252,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the FID array and its count in two steps */
- /* fall through */
+ fallthrough;
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, true);
@@ -271,7 +271,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
afs_extract_to_buf(call, call->count * 3 * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract FID array");
ret = afs_extract_data(call, true);
@@ -297,7 +297,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the callback array and its count in two steps */
- /* fall through */
+ fallthrough;
case 3:
_debug("extract CB count");
ret = afs_extract_data(call, true);
@@ -312,7 +312,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract discard %zu/%u",
iov_iter_count(call->iter), call->count2 * 3 * 4);
@@ -391,7 +391,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract UUID");
ret = afs_extract_data(call, false);
@@ -503,7 +503,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract UUID");
ret = afs_extract_data(call, false);
@@ -618,7 +618,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the FID array and its count in two steps */
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, true);
@@ -637,7 +637,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract FID array");
ret = afs_extract_data(call, false);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index b79879aacc02..7b784af604fd 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -382,15 +382,17 @@ void afs_dynroot_depopulate(struct super_block *sb)
net->dynroot_sb = NULL;
mutex_unlock(&net->proc_cells_lock);
- inode_lock(root->d_inode);
-
- /* Remove all the pins for dirs created for manually added cells */
- list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
- if (subdir->d_fsdata) {
- subdir->d_fsdata = NULL;
- dput(subdir);
+ if (root) {
+ inode_lock(root->d_inode);
+
+ /* Remove all the pins for dirs created for manually added cells */
+ list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
+ if (subdir->d_fsdata) {
+ subdir->d_fsdata = NULL;
+ dput(subdir);
+ }
}
- }
- inode_unlock(root->d_inode);
+ inode_unlock(root->d_inode);
+ }
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6f6ed1605cfe..371d1488cc54 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -311,7 +311,7 @@ int afs_page_filler(void *data, struct page *page)
case -ENOBUFS:
_debug("cache said ENOBUFS");
- /* fall through */
+ fallthrough;
default:
go_on:
req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index ffb8575345ca..cb3054c7843e 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -376,7 +376,6 @@ again:
spin_unlock(&vnode->lock);
return;
- /* Fall through */
default:
/* Looks like a lock request was withdrawn. */
spin_unlock(&vnode->lock);
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 24fd163c6323..97cab12b0a6c 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -235,6 +235,7 @@ int afs_put_operation(struct afs_operation *op)
afs_end_cursor(&op->ac);
afs_put_serverlist(op->net, op->server_list);
afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op);
+ key_put(op->key);
kfree(op);
return ret;
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 5d9ef517cf81..e7e98ad63a91 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -161,8 +161,8 @@ responded:
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index acb4d0ca2649..1d95ed9dd86e 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -320,7 +320,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->tmp_u = htonl(0);
afs_extract_to_tmp(call);
}
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -348,7 +348,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->bvec[0].bv_page = req->pages[req->index];
iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -375,7 +375,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
call->unmarshall = 3;
- /* Fall through */
+ fallthrough;
case 3:
_debug("extract discard %zu/%llu",
@@ -388,7 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
no_more_data:
call->unmarshall = 4;
afs_extract_to_buf(call, (21 + 3 + 6) * 4);
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 4:
@@ -1343,7 +1343,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
case 0:
call->unmarshall++;
afs_extract_to_buf(call, 12 * 4);
- /* Fall through */
+ fallthrough;
/* extract the returned status record */
case 1:
@@ -1356,7 +1356,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs);
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* extract the volume name length */
case 2:
@@ -1371,7 +1371,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the volume name */
case 3:
@@ -1385,7 +1385,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("volname '%s'", p);
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message length */
case 4:
@@ -1400,7 +1400,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message */
case 5:
@@ -1415,7 +1415,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day length */
case 6:
@@ -1430,7 +1430,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day */
case 7:
@@ -1682,7 +1682,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the capabilities word count */
case 1:
@@ -1696,7 +1696,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
call->count2 = count;
afs_extract_discard(call, count * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract capabilities words */
case 2:
@@ -1776,7 +1776,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file status count and array in two steps */
case 1:
@@ -1794,7 +1794,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_counts:
afs_extract_to_buf(call, 21 * sizeof(__be32));
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract status array %u", call->count);
@@ -1824,7 +1824,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* Extract the callback count and array in two steps */
case 3:
@@ -1841,7 +1841,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_cbs:
afs_extract_to_buf(call, 3 * sizeof(__be32));
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract CB array");
@@ -1870,7 +1870,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
afs_extract_to_buf(call, 6 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
ret = afs_extract_data(call, false);
@@ -1974,7 +1974,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -1992,7 +1992,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
acl->size = call->count2;
afs_extract_begin(call, acl->data, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -2002,7 +2002,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
afs_extract_to_buf(call, (21 + 6) * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 3:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1d13d2e882ad..0fe8844b4bee 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
- struct inode *inode = &op->file[0].vnode->vfs_inode;
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
+ loff_t old_i_size = i_size_read(inode);
+
+ op->setattr.old_i_size = old_i_size;
+ afs_vnode_commit_status(op, vp);
+ /* inode->i_size has now been changed. */
+
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t size = op->setattr.attr->ia_size;
+ if (size > old_i_size)
+ pagecache_isize_extended(inode, old_i_size, size);
+ }
+}
+
+static void afs_setattr_edit_file(struct afs_operation *op)
+{
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
- afs_vnode_commit_status(op, &op->file[0]);
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
- loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
- if (size > i_size)
- pagecache_isize_extended(inode, i_size, size);
- truncate_pagecache(inode, size);
+ loff_t size = op->setattr.attr->ia_size;
+ loff_t i_size = op->setattr.old_i_size;
+
+ if (size < i_size)
+ truncate_pagecache(inode, size);
}
}
@@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = {
.issue_afs_rpc = afs_fs_setattr,
.issue_yfs_rpc = yfs_fs_setattr,
.success = afs_setattr_success,
+ .edit_dir = afs_setattr_edit_file,
};
/*
@@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ /* Prevent any new writebacks from starting whilst we do this. */
+ down_write(&vnode->validate_lock);
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
- if (IS_ERR(op))
- return PTR_ERR(op);
+ if (IS_ERR(op)) {
+ ret = PTR_ERR(op);
+ goto out_unlock;
+ }
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
@@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
- return afs_do_sync_operation(op);
+ ret = afs_do_sync_operation(op);
+
+out_unlock:
+ up_write(&vnode->validate_lock);
+ _leave(" = %d", ret);
+ return ret;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 792ac711985e..e5f0446f27e5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -401,22 +401,24 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */
#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
+#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
atomic_t usage;
+ unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT as ktime/64 */
+ unsigned int rtt; /* RTT in uS */
u32 abort_code;
short error;
- bool have_result;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
+ unsigned short flags;
+#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */
+#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */
+#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */
+#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
u16 port;
@@ -810,6 +812,7 @@ struct afs_operation {
} store;
struct {
struct iattr *attr;
+ loff_t old_i_size;
} setattr;
struct afs_acl *acl;
struct yfs_acl *yacl;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 5334f1bd2bca..1d1a8debe472 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -120,42 +120,42 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
if (e->error == -ETIMEDOUT ||
e->error == -ETIME)
return;
- /* Fall through */
+ fallthrough;
case -ETIMEDOUT:
case -ETIME:
if (e->error == -ENOMEM ||
e->error == -ENONET)
return;
- /* Fall through */
+ fallthrough;
case -ENOMEM:
case -ENONET:
if (e->error == -ERFKILL)
return;
- /* Fall through */
+ fallthrough;
case -ERFKILL:
if (e->error == -EADDRNOTAVAIL)
return;
- /* Fall through */
+ fallthrough;
case -EADDRNOTAVAIL:
if (e->error == -ENETUNREACH)
return;
- /* Fall through */
+ fallthrough;
case -ENETUNREACH:
if (e->error == -EHOSTUNREACH)
return;
- /* Fall through */
+ fallthrough;
case -EHOSTUNREACH:
if (e->error == -EHOSTDOWN)
return;
- /* Fall through */
+ fallthrough;
case -EHOSTDOWN:
if (e->error == -ECONNREFUSED)
return;
- /* Fall through */
+ fallthrough;
case -ECONNREFUSED:
if (e->error == -ECONNRESET)
return;
- /* Fall through */
+ fallthrough;
case -ECONNRESET: /* Responded, but call expired. */
if (e->responded)
return;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e817fc740ba0..e8babb62ed44 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -310,6 +310,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
alist->preferred == i ? '>' : '-',
&alist->addrs[i].transport);
}
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+ vlserver->probe.flags, vlserver->probe.error,
+ vlserver->probe.abort_code,
+ atomic_read(&vlserver->probe_outstanding));
return 0;
}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 6a0935cb822f..d83f13c44b92 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_operation *op)
case -ETIME:
if (op->error != -EDESTADDRREQ)
goto iterate_address;
- /* Fall through */
+ fallthrough;
case -ERFKILL:
case -EADDRNOTAVAIL:
case -ENETUNREACH:
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8fc8fb406a5a..8be709cb8542 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -568,7 +568,7 @@ static void afs_deliver_to_call(struct afs_call *call)
case -EIO:
pr_err("kAFS: Call %u in bad state %u\n",
call->debug_id, state);
- /* Fall through */
+ fallthrough;
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
@@ -669,7 +669,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
ret = call->ret0;
call->ret0 = 0;
- /* Fall through */
+ fallthrough;
case -ECONNABORTED:
ac->responded = true;
break;
@@ -872,7 +872,7 @@ void afs_send_empty_reply(struct afs_call *call)
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
RX_USER_ABORT, -ENOMEM, "KOO");
- /* Fall through */
+ fallthrough;
default:
_leave(" [error]");
return;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b552357b1d13..3a40ee752c1e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -456,7 +456,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 8fea54eba0c2..38b2ba1d9ec0 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -21,6 +21,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index e3aa013c2177..d1c7068b4346 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -11,15 +11,33 @@
#include "internal.h"
#include "protocol_yfs.h"
-static bool afs_vl_probe_done(struct afs_vlserver *server)
+
+/*
+ * Handle the completion of a set of probes.
+ */
+static void afs_finished_vl_probe(struct afs_vlserver *server)
{
- if (!atomic_dec_and_test(&server->probe_outstanding))
- return false;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
+ server->rtt = UINT_MAX;
+ clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ }
- wake_up_var(&server->probe_outstanding);
clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
- return true;
+}
+
+/*
+ * Handle the completion of a probe RPC call.
+ */
+static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
+{
+ if (atomic_dec_and_test(&server->probe_outstanding)) {
+ afs_finished_vl_probe(server);
+ wake_up = true;
+ }
+
+ if (wake_up)
+ wake_up_all(&server->probe_wq);
}
/*
@@ -45,15 +63,20 @@ void afs_vlserver_probe_result(struct afs_call *call)
server->probe.error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
server->probe.abort_code = call->abort_code;
server->probe.error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- server->probe.local_failure = true;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ case -EKEYEXPIRED:
+ case -EKEYREVOKED:
+ case -EKEYREJECTED:
+ server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE;
+ if (server->probe.error == 0)
+ server->probe.error = ret;
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
case -ERFKILL:
@@ -67,12 +90,12 @@ void afs_vlserver_probe_result(struct afs_call *call)
default:
clear_bit(index, &alist->responded);
set_bit(index, &alist->failed);
- if (!server->probe.responded &&
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
server->probe.error == -ETIME))
server->probe.error = ret;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
}
@@ -81,39 +104,36 @@ responded:
clear_bit(index, &alist->failed);
if (call->service_id == YFS_VL_SERVICE) {
- server->probe.is_yfs = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
alist->preferred = index;
- have_result = true;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED;
set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+ set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ have_result = true;
out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
server_index, index, &alist->addrs[index].transport, rtt_us, ret);
- have_result |= afs_vl_probe_done(server);
- if (have_result) {
- server->probe.have_result = true;
- wake_up_var(&server->probe.have_result);
- wake_up_all(&server->probe_wq);
- }
+ afs_done_one_vl_probe(server, have_result);
}
/*
@@ -151,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
in_progress = true;
} else {
afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_done_one_vl_probe(server, false);
}
}
- if (!in_progress)
- afs_vl_probe_done(server);
return in_progress;
}
@@ -193,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
{
struct wait_queue_entry *waits;
struct afs_vlserver *server;
- unsigned int rtt = UINT_MAX;
+ unsigned int rtt = UINT_MAX, rtt_s;
bool have_responders = false;
int pref = -1, i;
@@ -205,7 +224,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
server = vllist->servers[i].server;
if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
__clear_bit(i, &untried);
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
have_responders = true;
}
}
@@ -231,7 +250,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
goto stop;
if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
still_probing = true;
@@ -249,10 +268,11 @@ stop:
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded &&
- server->probe.rtt < rtt) {
+ rtt_s = READ_ONCE(server->rtt);
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) &&
+ rtt_s < rtt) {
pref = i;
- rtt = server->probe.rtt;
+ rtt = rtt_s;
}
remove_wait_queue(&server->probe_wq, &waits[i]);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index f405ca8b240a..c0458c903b31 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -192,7 +192,8 @@ pick_server:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) || !s->probe.responded)
+ if (!test_bit(i, &vc->untried) ||
+ !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
if (s->probe.rtt < rtt) {
vc->index = i;
@@ -262,10 +263,14 @@ no_more_servers:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
+ e.responded = true;
afs_prioritise_error(&e, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
+ error = e.error;
+
failed_set_error:
vc->error = error;
failed:
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index fd82850cd424..dc9327332f06 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -196,7 +196,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
/* Extract the returned uuid, uniquifier, nentries and
* blkaddrs size */
- /* Fall through */
+ fallthrough;
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -221,7 +221,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
count = min(call->count, 4U);
afs_extract_to_buf(call, count * sizeof(__be32));
- /* Fall through - and extract entries */
+ fallthrough; /* and extract entries */
case 2:
ret = afs_extract_data(call, call->count > 4);
if (ret < 0)
@@ -324,7 +324,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through - and extract the capabilities word count */
+ fallthrough; /* and extract the capabilities word count */
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -337,7 +337,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
call->unmarshall++;
afs_extract_discard(call, count * sizeof(__be32));
- /* Fall through - and extract capabilities words */
+ fallthrough; /* and extract capabilities words */
case 2:
ret = afs_extract_data(call, false);
if (ret < 0)
@@ -436,7 +436,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
/* Extract the returned uuid, uniquifier, fsEndpoints count and
* either the first fsEndpoint type or the volEndpoints
* count if there are no fsEndpoints. */
- /* Fall through */
+ fallthrough;
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -475,7 +475,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall = 2;
- /* Fall through - and extract fsEndpoints[] entries */
+ fallthrough; /* and extract fsEndpoints[] entries */
case 2:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -526,7 +526,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
* extract the type of the next endpoint when we extract the
* data of the current one, but this is the first...
*/
- /* Fall through */
+ fallthrough;
case 3:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -552,7 +552,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall = 4;
- /* Fall through - and extract volEndpoints[] entries */
+ fallthrough; /* and extract volEndpoints[] entries */
case 4:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -587,7 +587,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_discard(call, 0);
call->unmarshall = 5;
- /* Fall through - Done */
+ fallthrough; /* Done */
case 5:
ret = afs_extract_data(call, false);
if (ret < 0)
@@ -663,7 +663,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through - and extract the cell name length */
+ fallthrough; /* and extract the cell name length */
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -685,7 +685,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_begin(call, cell_name, namesz);
call->unmarshall++;
- /* Fall through - and extract cell name */
+ fallthrough; /* and extract cell name */
case 2:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -694,7 +694,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_discard(call, call->count2);
call->unmarshall++;
- /* Fall through - and extract padding */
+ fallthrough; /* and extract padding */
case 3:
ret = afs_extract_data(call, false);
if (ret < 0)
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a121c247d95a..da12abd6db21 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -609,7 +609,7 @@ no_more:
default:
pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- /* Fall through */
+ fallthrough;
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping,
int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
pgoff_t start, end, next;
int ret;
_enter("");
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ down_read(&vnode->validate_lock);
+ else if (!down_read_trylock(&vnode->validate_lock))
+ return 0;
+
if (wbc->range_cyclic) {
start = mapping->writeback_index;
end = -1;
@@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping,
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
+ up_read(&vnode->validate_lock);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 8c24fdc899e3..3b1239b7e90d 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -373,7 +373,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->offset = req->pos & (PAGE_SIZE - 1);
afs_extract_to_tmp64(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -401,7 +401,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
call->bvec[0].bv_page = req->pages[req->index];
iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -428,7 +428,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
call->unmarshall = 3;
- /* Fall through */
+ fallthrough;
case 3:
_debug("extract discard %zu/%llu",
@@ -444,7 +444,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
sizeof(struct yfs_xdr_YFSFetchStatus) +
sizeof(struct yfs_xdr_YFSCallBack) +
sizeof(struct yfs_xdr_YFSVolSync));
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 4:
@@ -461,7 +461,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->file_size = vp->scb.status.size;
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
break;
@@ -1262,7 +1262,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
case 0:
call->unmarshall++;
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
- /* Fall through */
+ fallthrough;
/* extract the returned status record */
case 1:
@@ -1275,7 +1275,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs);
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* extract the volume name length */
case 2:
@@ -1290,7 +1290,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the volume name */
case 3:
@@ -1304,7 +1304,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("volname '%s'", p);
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message length */
case 4:
@@ -1319,7 +1319,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message */
case 5:
@@ -1334,7 +1334,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day length */
case 6:
@@ -1349,7 +1349,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day */
case 7:
@@ -1363,7 +1363,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("motd '%s'", p);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 8:
break;
@@ -1622,7 +1622,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file status count and array in two steps */
case 1:
@@ -1640,7 +1640,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_counts:
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract status array %u", call->count);
@@ -1670,7 +1670,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* Extract the callback count and array in two steps */
case 3:
@@ -1687,7 +1687,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_cbs:
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract CB array");
@@ -1716,7 +1716,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
ret = afs_extract_data(call, false);
@@ -1727,7 +1727,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
xdr_decode_YFSVolSync(&bp, &op->volsync);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 6:
break;
@@ -1804,7 +1804,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file ACL length */
case 1:
@@ -1826,7 +1826,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_discard(call, size);
}
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file ACL */
case 2:
@@ -1836,7 +1836,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the volume ACL length */
case 3:
@@ -1858,7 +1858,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_discard(call, size);
}
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the volume ACL */
case 4:
@@ -1871,7 +1871,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
sizeof(struct yfs_xdr_YFSFetchStatus) +
sizeof(struct yfs_xdr_YFSVolSync));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 5:
@@ -1886,7 +1886,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
xdr_decode_YFSVolSync(&bp, &op->volsync);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 6:
break;
diff --git a/fs/aio.c b/fs/aio.c
index 5736bff48e9e..c45c20d87538 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
*iovec = NULL;
return ret;
}
-#ifdef CONFIG_COMPAT
- if (compat)
- return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
- iter);
-#endif
- return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
+
+ return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
@@ -1511,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
* may be already running. Just fail this IO with EINTR.
*/
ret = -EINTR;
- /*FALLTHRU*/
+ fallthrough;
default:
req->ki_complete(req, ret, 0);
}
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index f3a0f412b43b..75105f45c51a 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -20,7 +20,7 @@
* another mount. This situation arises when starting automount(8)
* or other user space daemon which uses direct mounts or offset
* mounts (used for autofs lazy mount/umount of nested mount trees),
- * which have been left busy at at service shutdown.
+ * which have been left busy at service shutdown.
*/
typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
@@ -496,7 +496,7 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
* located path is the root of a mount we return 1 along with
* the super magic of the mount or 0 otherwise.
*
- * In both cases the the device number (as returned by
+ * In both cases the device number (as returned by
* new_encode_dev()) is also returned.
*/
static int autofs_dev_ioctl_ismountpoint(struct file *fp,
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 74c886f7c51c..5ced859dac53 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = kernel_write(file, data, bytes, &file->f_pos);
+ wr = __kernel_write(file, data, bytes, NULL);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f2f9086ebe98..b9c658e0548e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- len = data_len + extra;
+ len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm,
vm_munmap(textpos, text_len);
goto err;
}
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(unsigned long),
+ FLAT_DATA_ALIGN);
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
data_len + bss_len + stack_len, datapos);
@@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
- len = text_len + data_len + extra;
+ len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm,
}
realdatastart = textpos + ntohl(hdr->data_start);
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(u32),
+ FLAT_DATA_ALIGN);
reloc = (__be32 __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
@@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm,
(text_len + full_data
- sizeof(struct flat_hdr)),
0);
- if (datapos != realdatastart)
- memmove((void *)datapos, (void *)realdatastart,
- full_data);
+ memmove((void *) datapos, (void *) realdatastart,
+ full_data);
#else
/*
* This is used on MMU systems mainly for testing.
@@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm,
if (IS_ERR_VALUE(result)) {
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
- vm_munmap(textpos, text_len + data_len + extra);
+ vm_munmap(textpos, text_len + data_len + extra +
+ MAX_SHARED_LIBS * sizeof(u32));
goto err;
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8ae833e00443..9e84b1928b94 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ struct block_device *claimed_bdev = NULL;
+ int err;
+
+ /*
+ * If we don't hold exclusive handle for the device, upgrade to it
+ * while we discard the buffer cache to avoid discarding buffers
+ * under live filesystem.
+ */
+ if (!(mode & FMODE_EXCL)) {
+ claimed_bdev = bdev->bd_contains;
+ err = bd_prepare_to_claim(bdev, claimed_bdev,
+ truncate_bdev_range);
+ if (err)
+ return err;
+ }
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (claimed_bdev)
+ bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ return 0;
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
static void set_init_blocksize(struct block_device *bdev)
{
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
@@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev)
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
+ spin_lock_init(&bdev->bd_size_lock);
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev)
return bdev;
}
-EXPORT_SYMBOL(bdget);
-
/**
* bdgrab -- Grab a reference to an already referenced block device
* @bdev: Block device to grab a reference to.
@@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev)
}
EXPORT_SYMBOL(bdgrab);
+struct block_device *bdget_part(struct hd_struct *part)
+{
+ return bdget(part_devt(part));
+}
+
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk,
{
loff_t disk_size, bdev_size;
+ spin_lock(&bdev->bd_size_lock);
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
@@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ }
+ spin_unlock(&bdev->bd_size_lock);
+
+ if (bdev_size > disk_size) {
+ if (__invalidate_device(bdev, false))
pr_warn("VFS: busy inodes on resized disk %s\n",
disk->disk_name);
}
- bdev->bd_invalidated = 0;
}
/**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
*
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
*/
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
- int ret = 0;
-
- if (disk->fops->revalidate_disk)
- ret = disk->fops->revalidate_disk(disk);
+ struct block_device *bdev;
/*
* Hidden disks don't have associated bdev so there's no point in
- * revalidating it.
+ * revalidating them.
*/
- if (!(disk->flags & GENHD_FL_HIDDEN)) {
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (!bdev)
- return ret;
+ if (disk->flags & GENHD_FL_HIDDEN)
+ return;
- mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev, ret == 0);
- mutex_unlock(&bdev->bd_mutex);
+ bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ check_disk_size_change(disk, bdev, verbose);
bdput(bdev);
}
- return ret;
}
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
- struct gendisk *disk = bdev->bd_disk;
- const struct block_device_operations *bdops = disk->fops;
- unsigned int events;
-
- events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return 0;
-
- if (__invalidate_device(bdev, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
- bdev->bd_invalidated = 1;
- if (bdops->revalidate_disk)
- bdops->revalidate_disk(bdev->bd_disk);
- return 1;
-}
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
- inode_lock(bdev->bd_inode);
- i_size_write(bdev->bd_inode, size);
- inode_unlock(bdev->bd_inode);
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ spin_unlock(&bdev->bd_size_lock);
}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
@@ -1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
+ clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+
rescan:
ret = blk_drop_partitions(bdev);
if (ret)
@@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
struct gendisk *disk;
int ret;
int partno;
- int perm = 0;
bool first_open = false, unblock_events = true, need_restart;
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- if (!for_part) {
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0)
- return ret;
- }
-
restart:
need_restart = false;
ret = -ENXIO;
@@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
}
if (!ret) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bd_set_nr_sectors(bdev, get_capacity(disk));
set_init_blocksize(bdev);
}
@@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
@@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
ret = -ENXIO;
goto out_clear;
}
- bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
@@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
@@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* RETURNS:
* 0 on success, -errno on failure.
*/
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- int res;
+ int ret, perm = 0;
- res =__blkdev_get(bdev, mode, holder, 0);
- if (res)
- bdput(bdev);
- return res;
+ if (mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret)
+ goto bdput;
+
+ ret =__blkdev_get(bdev, mode, holder, 0);
+ if (ret)
+ goto bdput;
+ return 0;
+
+bdput:
+ bdput(bdev);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get);
/**
* blkdev_get_by_path - open a block device by name
@@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode))
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
return -ETXTBSY;
if (!iov_iter_count(from))
@@ -1969,7 +1967,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
int error;
@@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EINVAL;
/* Invalidate the page cache, including dirty pages. */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ return error;
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
@@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
* the caller will be given -EBUSY. The third argument is
* inclusive, so the rounding here is safe.
*/
- return invalidate_inode_pages2_range(mapping,
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..68b95ad82126 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ea10f7bc99ab..b3268f4ea5f3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2303,7 +2303,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(
return NULL;
ret->path = btrfs_alloc_path();
- if (!ret) {
+ if (!ret->path) {
kfree(ret);
return NULL;
}
@@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
while (!list_empty(&pending_edge)) {
struct btrfs_backref_node *upper;
struct btrfs_backref_node *lower;
- struct rb_node *rb_node;
edge = list_first_entry(&pending_edge,
struct btrfs_backref_edge, list[UPPER]);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 613920c17ac1..c0f1d6818df7 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
}
static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -1798,7 +1792,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
@@ -1874,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static int read_block_group_item(struct btrfs_block_group *cache,
+static void read_block_group_item(struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct btrfs_key *key)
{
@@ -1888,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache,
sizeof(bgi));
cache->used = btrfs_stack_block_group_used(&bgi);
cache->flags = btrfs_stack_block_group_flags(&bgi);
-
- return 0;
}
static int read_one_block_group(struct btrfs_fs_info *info,
@@ -1908,9 +1899,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- ret = read_block_group_item(cache, path, key);
- if (ret < 0)
- goto error;
+ read_block_group_item(cache, path, key);
+
+ set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2034,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2055,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -2096,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
return;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2110,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2132,6 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return -ENOMEM;
cache->length = size;
+ set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
@@ -2783,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
@@ -2959,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2992,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -3000,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3336,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->block_group_cache_lock);
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
-
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c47b6c6fea9f..92dd86bceae3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,14 +21,18 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -212,6 +216,11 @@ struct btrfs_inode {
struct inode vfs_inode;
};
+static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
+{
+ return inode->root->fs_info->sectorsize;
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -324,23 +333,6 @@ struct btrfs_dio_private {
u8 csums[];
};
-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1ab56a734e70..eeface30facd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -29,41 +29,6 @@
#include "extent_io.h"
#include "extent_map.h"
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
-void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
-
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
-void lzo_free_workspace(struct list_head *ws);
-
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
-
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9f3dbe372631..8001b700ea3a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 70e49d8d4f6c..113da62dc17f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type)
btrfs_csums[csum_type].name;
}
-size_t __const btrfs_get_num_csums(void)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
@@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1297,6 +1306,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1370,7 +1381,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1378,6 +1388,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
if (tm)
__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
@@ -1442,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1481,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1653,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1851,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1887,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1902,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2065,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2073,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2119,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2128,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2597,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2736,11 +2755,13 @@ again:
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2871,7 +2892,8 @@ cow_done:
} else {
if (!btrfs_tree_read_lock_atomic(b)) {
btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -3160,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3203,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3271,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3327,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3457,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3726,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3735,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3747,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3959,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -3970,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -3984,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4232,8 +4330,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4478,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4653,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4671,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4697,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4730,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
data_end -= data_size[i];
+ btrfs_set_token_item_offset(&token, item, data_end);
btrfs_set_token_item_size(&token, item, data_size[i]);
}
@@ -4773,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -5111,7 +5229,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5375,7 +5493,9 @@ again:
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5410,7 +5530,9 @@ again:
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c7e466f27a9..aac3d6f4e35b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,6 +374,7 @@ struct btrfs_path {
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_DONE = 2,
};
-void btrfs_init_async_reclaim_work(struct work_struct *work);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
struct reloc_control;
@@ -541,11 +542,6 @@ enum {
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -565,6 +561,19 @@ enum {
BTRFS_FS_DISCARD_RUNNING,
};
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -912,6 +921,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -935,6 +945,9 @@ struct btrfs_fs_info {
*/
int send_in_progress;
+ /* Type of exclusive operation running */
+ unsigned long exclusive_operation;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1181,24 +1194,40 @@ struct btrfs_root {
#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
@@ -2262,7 +2299,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __const btrfs_get_num_csums(void);
+size_t __attribute_const__ btrfs_get_num_csums(void);
/*
@@ -2518,13 +2555,14 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr);
+ u64 objectid, u64 offset, u64 bytenr, bool strict);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum {
*
* Can be interruped by fatal signal.
*/
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
/*
@@ -2619,7 +2659,7 @@ enum btrfs_flush_state {
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
@@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+ int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2930,11 +2969,15 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes);
+ u64 *ram_bytes, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
@@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
@@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0e354e9e57d0..bacee09b7bfd 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf1595a42a98..5aba81e16113 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- fs_info->nodesize, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
}
/* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ setup_items_for_insert(root, path, keys, data_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index db93909b25e0..4a0243cb9d97 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -64,10 +64,6 @@
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
@@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding) {
+ if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
@@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
-}
-
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ btrfs_rm_dev_replace_free_srcdev(src_device);
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+ return 0;
}
/*
@@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data)
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ae25f632157..8e3438672a82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
#endif
/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- return ret;
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
}
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
u64 found_start;
int found_level;
@@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ u8 val[BTRFS_CSUM_SIZE] = { 0 };
read_extent_buffer(eb, &val, 0, csum_size);
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
+ "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+ CSUM_FMT_VALUE(csum_size, val),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
goto err;
}
@@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info,
return 1;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(fs_info, BTRFS_I(inode));
@@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
-{
- return extent_read_full_page(page, btree_get_extent, 0);
-}
-
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
@@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page)
}
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
@@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
btrfs_put_root(root);
return ERR_CAST(leaf);
@@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *root;
while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
- btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
- root->root_key.objectid, root->root_key.offset,
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(root->root_key.objectid, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
+ IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
+ if (IS_ERR(tree_root->node)) {
handle_error = true;
+ ret = PTR_ERR(tree_root->node);
+ tree_root->node = NULL;
+ btrfs_warn(fs_info, "couldn't read tree root");
+ continue;
- if (IS_ERR(tree_root->node)) {
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
- ret = -EUCLEAN;
- }
-
- btrfs_warn(fs_info, "failed to read tree root");
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+ ret = -EIO;
+ btrfs_warn(fs_info, "error while reading tree root");
continue;
}
@@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+ btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type(disk_super);
@@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3418,6 +3357,8 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -3481,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_CAST(page);
super = page_address(page);
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -4055,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4551,6 +4497,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
@@ -4685,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
-
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 00dc39d47ed3..fee69ced58b4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index f39d47a2d01a..9800a8306368 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -34,10 +34,13 @@ struct io_failure_record;
*/
#define CHUNK_ALLOCATED EXTENT_DIRTY
#define CHUNK_TRIMMED EXTENT_DEFRAG
+#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
+ CHUNK_TRIMMED)
enum {
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
@@ -46,6 +49,7 @@ enum {
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
@@ -59,7 +63,6 @@ struct extent_io_tree {
u8 owner;
spinlock_t lock;
- const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61ede335f6c3..3b21fee13e77 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "rcu-string.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -399,12 +400,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -413,12 +413,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -428,8 +427,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -1177,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
@@ -1397,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -2305,7 +2323,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
@@ -2347,9 +2366,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2374,7 +2397,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
@@ -2385,7 +2408,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
@@ -2844,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2930,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function get called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -2962,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -2971,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -2987,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+
+ /* Quick path didn't find the EXTEMT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
@@ -3004,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3078,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3102,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
@@ -3117,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
@@ -3164,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3913,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
+static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
@@ -3949,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
@@ -4198,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
@@ -4499,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
@@ -4521,8 +4679,8 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
- btrfs_tree_lock(buf);
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4568,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4584,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4601,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
@@ -5668,6 +5827,19 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ /* Check if there are any CHUNK_* bits left */
+ if (start > device->total_bytes) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_in_rcu(fs_info,
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
+ start, end - start + 1,
+ rcu_str_deref(device->name),
+ device->total_bytes);
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = 0;
+ break;
+ }
+
/* Ensure we skip the reserved area in the first 1M */
start = max_t(u64, start, SZ_1M);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6def411b2eba..60f5f68d892d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
@@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
@@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (!btrfs_submit_read_repair(inode, bio, offset, page,
start - page_offset(page),
start, end, mirror,
- tree->ops->submit_bio_hook)) {
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page)
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page,
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
- end_bio_extent_readpage, mirror_num,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -4552,7 +4516,7 @@ next:
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
@@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4898,7 +4861,7 @@ out_free:
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5622,6 +5584,36 @@ unlock_exit:
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
offset = offset_in_page(start);
@@ -5655,9 +5643,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5677,7 +5665,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
offset = offset_in_page(start);
@@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *src = (char *)srcv;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
char *kaddr;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst,
char *kaddr;
unsigned long i = dst_offset >> PAGE_SHIFT;
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
+
WARN_ON(src->len != dst_len);
offset = offset_in_page(dst_offset);
@@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
dst_off_in_page = offset_in_page(dst_offset);
@@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 00a88f2eb5ab..f39d02e7f7ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- submit_bio_hook_t *submit_bio_hook;
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
-
-
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
@@ -102,7 +90,7 @@ struct extent_buffer {
int blocking_writers;
atomic_t blocking_readers;
- bool lock_nested;
+ bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num);
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
@@ -241,9 +232,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7d5ec71615b8..8f4f2bd6d9b9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
+ count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
+ disk_bytenr, csum, nblocks);
if (count)
goto found;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb824c7cb7c7..0ff659455b1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1057,11 +1057,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1571,7 +1564,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
if (!nowait)
@@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+ * 2.1. A worker thread from the last bio completed. In this
+ * case we need to mark the btrfs_dio_data that it is
+ * async in order to call generic_write_sync() properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+ * 2.2 The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges and races between logging and
- * completion of ordered extents for adjancent ranges - both races
- * could lead to file extent items in the log with overlapping ranges.
- * Do this while holding the inode lock, to avoid races with other
- * tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
+ (full_sync || list_empty(&ctx.ordered_extents)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
@@ -2286,6 +2335,12 @@ out:
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
return 0;
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
+ if (extent_info->disk_offset == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- clone_info->file_offset, clone_len);
+ extent_info->file_offset, replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0)
return 0;
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
@@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* returned by __btrfs_drop_extents() without having
* changed anything in the file.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (extent_info && !extent_info->is_new_extent &&
+ ret && ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
@@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
cur_offset = drop_end;
@@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!clone_info) {
+ if (!extent_info) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* than 16Mb would force the full fsync any way (when
* try_release_extent_mapping() is invoked during page cache truncation.
*/
- if (clone_info)
+ if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
if (ret) {
@@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
@@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
btrfs_free_path(path);
if (ret)
@@ -3044,7 +3119,7 @@ enum {
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode,
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode,
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
@@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6d961e11639e..af0013d3df63 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
@@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -1186,7 +1183,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
@@ -1347,13 +1343,14 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state);
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
@@ -2282,7 +2279,7 @@ out:
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
@@ -2298,7 +2295,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
- else
+ else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
/* See try_merge_free_space() comment. */
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 8b1f5c8897b7..6b9faf3b0e96 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6dc03bab0c9d..936c3137c646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,7 +6,6 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -31,6 +30,7 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -59,9 +59,10 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -654,12 +647,18 @@ cont:
page_error_op |
PAGE_END_WRITEBACK);
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
}
- kfree(pages);
-
return 0;
}
}
@@ -1604,7 +1603,7 @@ next_slot:
goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
- extent_offset, disk_bytenr);
+ extent_offset, disk_bytenr, false);
if (ret) {
/*
* ret could be -EIO if the above fails to read
@@ -2155,11 +2154,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
struct inode *inode = private_data;
- blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
@@ -2180,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2242,16 +2237,15 @@ out:
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2354,7 +2348,7 @@ again:
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2545,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
}
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
@@ -2565,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
- return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
- &stack_fi, oe->qgroup_rsv);
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ oe->qgroup_rsv);
}
/*
@@ -2663,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_ordered_extent_file_extent(trans, inode,
- ordered_extent);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2680,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -2749,7 +2742,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(inode, ordered_extent);
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2769,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
@@ -2781,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -2830,9 +2823,8 @@ zeroit:
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror)
{
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
@@ -3052,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
@@ -3392,7 +3383,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -4048,7 +4038,7 @@ out_end_trans:
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (err) {
@@ -4580,7 +4570,7 @@ again:
&cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -4845,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5302,15 +5289,15 @@ static void inode_tree_add(struct inode *inode)
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -6308,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6371,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -6386,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -6537,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6566,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
@@ -6576,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -6589,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
*/
path->leave_spinning = 1;
+ path->recurse = btrfs_is_free_space_inode(inode);
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -6640,12 +6621,11 @@ next:
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6696,10 +6676,8 @@ next:
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6723,29 +6701,28 @@ not_found:
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
- BUG_ON(!em); /* Error is always set */
return em;
}
@@ -6947,6 +6924,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks for (nowait == false) case.
@@ -6961,7 +6940,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+ u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
@@ -7039,8 +7018,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
* Do the same check as in btrfs_cross_ref_exist but without the
* unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
+ if (!strict &&
+ (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
backref_offset = btrfs_file_extent_offset(leaf, fi);
@@ -7076,7 +7056,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr);
+ key.offset - backref_offset, disk_bytenr,
+ strict);
if (ret) {
ret = 0;
goto out;
@@ -7104,7 +7085,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7153,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7242,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7297,7 +7255,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
+ &orig_block_len, &ram_bytes, false) == 1 &&
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
@@ -7326,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
@@ -7337,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
+
+ /*
+ * We used current->journal_info here to see if we were sync, but
+ * there's a lot of tests in the enospc machinery to not do flushing if
+ * we have a journal_info set, so we need to clear this out and re-set
+ * it in iomap_end.
+ */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
}
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->sync = sync;
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
+ }
+ iomap->private = dio_data;
+
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7426,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
}
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
free_extent_map(em);
@@ -7464,8 +7457,63 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, start,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
+ length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ /*
+ * We're all done, we can re-set the current->journal_info now safely
+ * for our endio.
+ */
+ if (dio_data->sync) {
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
+ kfree(dio_data);
+ iomap->private = NULL;
+
return ret;
}
@@ -7489,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}
- dio_end_io(dip->dio_bio);
+ bio_endio(dip->dio_bio);
kfree(dip);
}
@@ -7613,10 +7661,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
- blk_status_t ret;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7725,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
return dip;
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7759,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7767,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}
if (!write && csum) {
@@ -7839,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return;
+ return BLK_QC_T_NONE;
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -7883,37 +7919,59 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned flags)
+{
+ /*
+ * Now if we're still in the context of our submitter we know we can't
+ * safely run generic_write_sync(), so clear our flag here so that the
+ * caller knows to follow up with a sync.
+ */
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+ current->journal_info = NULL;
+ return error;
+ }
+
+ if (error)
+ return error;
+
+ if (size) {
+ iocb->ki_flags |= IOCB_DSYNC;
+ return generic_write_sync(iocb, size);
+ }
+
+ return 0;
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+static const struct iomap_dio_ops btrfs_sync_dops = {
+ .submit_io = btrfs_submit_direct,
+ .end_io = btrfs_maybe_fsync_end_io,
+};
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
if (check_direct_IO(fs_info, iter, offset))
return 0;
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
@@ -7921,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * We have are actually a sync iocb, so we need our fancy endio to know
+ * if we need to sync.
+ */
+ if (current->journal_info)
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
+ else
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, offset, dio_data.reserve,
- true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(BTRFS_I(inode),
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
+
if (relock)
inode_lock(inode);
@@ -7997,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
- return extent_read_full_page(page, btrfs_get_extent, 0);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ unsigned long bio_flags = 0;
+ struct bio *bio = NULL;
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+ if (bio)
+ ret = submit_one_bio(bio, 0, bio_flags);
+ return ret;
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8086,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping,
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -8105,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
@@ -8115,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
@@ -8137,7 +8168,7 @@ again:
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8176,7 +8207,7 @@ again:
* bit of its io_tree, and free the qgroup reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8278,7 +8309,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8609,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8638,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
@@ -8646,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
- btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
- btrfs_put_root(BTRFS_I(inode)->root);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -8775,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -8937,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -8993,46 +9000,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -9100,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9254,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9301,23 +9264,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9383,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9423,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
+ if (*nr != U64_MAX) {
+ (*nr)--;
+ if (*nr == 0)
+ goto out;
+ }
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -9450,18 +9400,15 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &nr, true);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -9484,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
+ ret = start_delalloc_inodes(root, &nr, false);
btrfs_put_root(root);
if (ret < 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -9563,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -9628,11 +9569,15 @@ out_unlock:
return err;
}
-static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
struct inode *inode, struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int ret;
@@ -9649,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
if (ret < 0)
- return ret;
- return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
- &stack_fi, ret);
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
+ file_offset, &stack_fi, ret);
+ if (ret)
+ return ERR_PTR(ret);
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.qgroup_reserved = ret;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return trans;
}
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9675,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -9694,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -9711,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
@@ -9780,8 +9742,10 @@ next:
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
@@ -9860,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
@@ -10067,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
@@ -10130,7 +10093,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -10220,7 +10183,7 @@ out:
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -10278,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -10301,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd3511c5ca81..ab408a23ba32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags)
return 0;
}
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
@@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -742,7 +755,7 @@ fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
err = btrfs_commit_transaction(trans);
if (err && !ret)
@@ -856,7 +869,7 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -1306,7 +1319,7 @@ again:
break;
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
mnt_drop_write_file(file);
return ret;
}
@@ -2086,9 +2099,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2096,10 +2114,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2184,6 +2206,11 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
+ if (ret)
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -3112,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -3129,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3158,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -3169,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3200,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -3218,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3448,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3504,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3722,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -3937,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -3983,7 +4004,6 @@ again:
}
locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4038,10 +4058,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
*/
need_unlock = false;
@@ -4060,7 +4080,7 @@ out_bargs:
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
return ret;
@@ -4883,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f75612e18a82..66e02ebdd340 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,8 +57,8 @@
* performance reasons.
*
*
- * Lock nesting
- * ------------
+ * Lock recursion
+ * --------------
*
* A write operation on a tree might indirectly start a look up on the same
* tree. This can happen when btrfs_cow_block locks the tree and needs to
@@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
@@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
@@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*
* The rwlock is held upon exit.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse)
{
u64 start_ns = 0;
@@ -263,8 +264,9 @@ again:
* depends on this as it may be called on a partly
* (write-)locked tree.
*/
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
+ WARN_ON(!recurse);
+ BUG_ON(eb->lock_recursed);
+ eb->lock_recursed = true;
read_unlock(&eb->lock);
trace_btrfs_tree_read_lock(eb, start_ns);
return;
@@ -279,6 +281,11 @@ again:
trace_btrfs_tree_read_lock(eb, start_ns);
}
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
+
/*
* Lock extent buffer for read, optimistically expecting that there are no
* contending blocking writers. If there are, don't wait.
@@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*
* The rwlock is held for write upon exit.
*/
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -434,6 +441,11 @@ again:
trace_btrfs_tree_lock(eb, start_ns);
}
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
/*
* Release the write lock, either blocking or spinning (ie. there's no need
* for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
@@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d715846c10b8..3ea81ed3320b 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -16,11 +16,81 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
struct btrfs_path;
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
@@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse);
+
+static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ return __btrfs_read_lock_root_node(root, false);
+}
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ebac13389e7e..87bac9ecdf4c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
+ trace_btrfs_ordered_extent_add(inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -377,17 +378,16 @@ out:
* test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
* to make sure this function only returns 1 once for a given ordered extent.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -408,7 +408,7 @@ have_entry:
}
if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
@@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
@@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -775,17 +803,45 @@ out:
}
/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,20 +859,21 @@ out:
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
return 0;
@@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr &&
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
+ i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
+ num_sectors = ordered_sum->len >> blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
@@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
break;
}
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d61ea9c880a3..c3a2325e64a4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,6 +56,12 @@ enum {
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
};
struct btrfs_ordered_extent {
@@ -104,6 +110,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* used for fast fsyncs */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
}
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
@@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 61f44e78e3c9..7695c4783d33 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,44 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(u64 objectid, char *buf)
+{
+ int i;
+
+ if (objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", objectid);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid);
+ return buf;
+}
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
@@ -95,9 +133,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -112,8 +151,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38fd75ad..78b99385a503 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(u64 objectid, char *buf);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c0f350c3a0cf..580899bdb991 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 243a2e44526e..9d4f5316a7e8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work)
kfree(rmw);
}
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
- int i;
+ struct btrfs_device *device;
-again:
do {
enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+ return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ int i;
+ u64 enqueued = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ enqueued += reada_start_for_fsdevs(fs_devices);
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ enqueued += reada_start_for_fsdevs(seed_devs);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
if (enqueued == 0)
return;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 5cd02514cf4d..99aa87c08912 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -45,7 +45,7 @@ out:
return ret;
}
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
@@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode,
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- file_offset, block_size);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
if (ret)
goto out;
- page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(inode->i_mapping));
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
set_page_extent_mapped(page);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
- 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -134,9 +134,9 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- file_offset, block_size, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
out:
extent_changeset_free(data_reserved);
@@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(dst, new_key->offset,
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
inline_data, size, datal,
comp_type);
goto out;
@@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -231,8 +231,8 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -439,7 +439,7 @@ process_slot:
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
+ struct btrfs_replace_extent_info clone_info;
/*
* a | --- range to clone ---| b
@@ -462,8 +462,8 @@ process_slot:
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path, drop_start,
+ clone_info.is_new_extent = false;
+ ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
if (ret)
@@ -520,6 +520,8 @@ process_slot:
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
ret = 0;
@@ -533,7 +535,7 @@ process_slot:
btrfs_release_path(path);
path->leave_spinning = 0;
- ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+ ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4ba1ab9cc76d..3602806d71bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1206,7 +1206,8 @@ again:
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1274,7 +1275,8 @@ again:
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
+ slot, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
if (ret < 0)
@@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index c89697486366..702dc5441f03 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a6cb9db512e..cf63f1e27a27 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
@@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
@@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
return 0;
}
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
+
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
+ }
+}
+
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
- lockdep_assert_held(&fs_info->scrub_lock);
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
- if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL);
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
- flags, is_dev_replace ? 1 : max_active, 4);
- if (!fs_info->scrub_workers)
- goto fail_scrub_workers;
-
- ASSERT(fs_info->scrub_wr_completion_workers == NULL);
- fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
- if (!fs_info->scrub_wr_completion_workers)
- goto fail_scrub_wr_completion_workers;
+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
+ is_dev_replace ? 1 : max_active, 4);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
- ASSERT(fs_info->scrub_parity_workers == NULL);
- fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_parity_workers)
- goto fail_scrub_parity_workers;
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ max_active, 2);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
- } else {
- refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
}
- return 0;
+ /* Other thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = 0;
+ btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
- return -ENOMEM;
+ return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
- goto out_free_ctx;
+ goto out;
}
if (!is_dev_replace && !readonly &&
@@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
ret = -EROFS;
- goto out_free_ctx;
+ goto out;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
- goto out_free_ctx;
+ goto out;
}
down_read(&fs_info->dev_replace.rwsem);
@@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
- goto out_free_ctx;
+ goto out;
}
up_read(&fs_info->dev_replace.rwsem);
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out_free_ctx;
- }
-
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
-
- fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
- }
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
return ret;
-
+out:
+ scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d9813a5b075a..340c76a12ce1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -122,8 +122,6 @@ struct send_ctx {
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -278,11 +276,6 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx)
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
}
/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- f1 (ino 257)
+ * |----- f2 (ino 258)
+ * |----- d1/ (ino 259)
+ * |----- d2/ (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- d1 (ino 258)
+ * |----- f2/ (ino 259)
+ * |----- f2_link/ (ino 260)
+ * | |----- f1 (ino 257)
+ * |
+ * |----- d2 (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists, it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
* This does all the move/link/unlink/rmdir magic.
*/
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
@@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the path used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * than the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct name_cache_entry *nce;
struct waiting_dir_move *wdm;
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
@@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
}
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
if (ret < 0)
@@ -4799,7 +4911,25 @@ out:
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
+ int ret;
+
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
@@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
@@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5033,8 +5150,8 @@ out:
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
@@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
-
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, void *ctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx)
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7378,7 +7466,6 @@ out:
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f7034f..de91488b7cd0 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -13,7 +13,6 @@
#define BTRFS_SEND_STREAM_VERSION 1
#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 475968ccbd1d..64099565ab8f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
@@ -476,28 +465,6 @@ again:
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
- u64 async_pages;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly, so increase the amount to reclaim by 2x in order to
+ * make sure we're flushing enough delalloc to hopefully reclaim
+ * some metadata reservations.
+ */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ }
trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ btrfs_start_delalloc_roots(fs_info, items);
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -596,14 +551,6 @@ skip_async:
}
spin_unlock(&space_info->lock);
- loops++;
- if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- } else {
- time_left = schedule_timeout_killable(1);
- if (time_left)
- break;
- }
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
@@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes_needed;
u64 reclaim_bytes = 0;
+ u64 bytes_needed = 0;
u64 cur_free_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
+ if (ticket)
+ bytes_needed = ticket->bytes;
if (bytes_needed > cur_free_bytes)
bytes_needed -= cur_free_bytes;
@@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
goto commit;
/*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
+ * See if there is some space in the delayed insertion reserve for this
+ * reservation. If the space_info's don't match (like for DATA or
+ * SYSTEM) then just go enospc, reclaiming this space won't recover any
+ * space to satisfy those reservations.
*/
if (space_info != delayed_rsv->space_info)
goto enospc;
@@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ shrink_delalloc(fs_info, space_info, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
@@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * FLUSH_DELAYED_REFS
+ * The above two cases generate delayed refs that will affect
+ * ->total_bytes_pinned. However this counter can be inconsistent with
+ * reality if there are outstanding delayed refs. This is because we adjust
+ * the counter based solely on the current set of delayed refs and disregard
+ * any on-disk state which might include more refs. So for example, if we
+ * have an extent with 2 references, but we only drop 1, we'll see that there
+ * is a negative delayed ref count for the extent and assume that the space
+ * will be freed, and thus increase ->total_bytes_pinned.
+ *
+ * Running the delayed refs gives us the actual real view of what will be
+ * freed at the transaction commit time. This stage will not actually free
+ * space for us, it just makes sure that may_commit_transaction() has all of
+ * the information it needs to make the right decision.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by the previous
+ * two stages. We will not commit the transaction if we don't think we're
+ * likely to satisfy our request, which means if our current free space +
+ * total_bytes_pinned < reservation we will not commit. This is why the
+ * previous states are actually important, to make sure we know for sure
+ * whether committing the transaction will allow us to make progress.
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_WAIT,
+ RUN_DELAYED_IPUTS,
+ FLUSH_DELAYED_REFS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ int flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state]);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
} while (flush_state < states_nr);
}
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
@@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
@@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
@@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
@@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
- flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
return ret;
}
+
+/**
+ * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
+ * @fs_info - the filesystem
+ * @bytes - the number of bytes we need
+ * @flush - how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c3c64019950a..5646393b928c 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 079b059818e9..c46be27be700 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,16 +7,6 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
{
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5a9dc31d95c9..8840a4fa81eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -517,6 +517,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
+ int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
@@ -598,6 +599,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
+ saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
@@ -623,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -642,6 +645,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
@@ -662,11 +667,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(info, COMPRESS) &&
- (info->compress_type != saved_compress_type ||
- compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(info, COMPRESS) &&
- no_compress == 1)) {
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
@@ -1382,6 +1387,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1468,8 +1474,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
@@ -1860,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
@@ -1950,6 +1962,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
out:
+ /*
+ * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
+ * since the absence of the flag means it can be toggled off by remount.
+ */
+ *flags |= SB_I_VERSION;
+
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
@@ -2146,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, &fs_info->space_info, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2176,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 104c80caaa74..279d9262b676 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -14,6 +14,7 @@
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
@@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
NULL
};
@@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
BTRFS_ATTR(, checksum, btrfs_checksum_show);
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
NULL,
};
@@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
- btrfs_reset_fs_info_ptr(fs_info);
-
sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
@@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(fsid_kobj, btrfs_attrs);
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->devices_kobj)
- return -EINVAL;
-
- if (one_device) {
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
-
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- return 0;
+ if (device->bdev) {
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(devices_kobj, disk_kobj->name);
}
- list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
-
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
@@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
+ int ret;
unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
- nofs_flag = memalloc_nofs_save();
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
- if (one_device && one_device != dev)
- continue;
+ nofs_flag = memalloc_nofs_save();
- if (dev->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ if (device->bdev) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->devices_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
}
+ }
- init_completion(&dev->kobj_unregister);
- error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devinfo_kobj, "%llu",
- dev->devid);
- if (error) {
- kobject_put(&dev->devid_kobj);
- break;
- }
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
}
+
+out:
memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
- return error;
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
@@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1565,9 +1660,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
rbtree_postorder_for_each_entry_safe(qgroup, next,
&fs_info->qgroup_tree, node)
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
- kobject_del(fs_info->qgroups_kobj);
- kobject_put(fs_info->qgroups_kobj);
- fs_info->qgroups_kobj = NULL;
+ if (fs_info->qgroups_kobj) {
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+ }
}
/* Called when qgroups get initialized, thus there is no need for locking */
@@ -1624,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index cf839c46a131..bacef43f7267 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,15 +13,12 @@ enum btrfs_feature_set {
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b5978e..df54cdfdc250 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, &key, &value_len, 1);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 894a63a92236..e6719f7db386 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
}
/*
@@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 20c6ac1a5de7..52ada47aff50 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -292,6 +292,8 @@ loop:
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -1636,6 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -2164,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(trans);
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d60b055b8695..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -85,6 +85,13 @@ struct btrfs_transaction {
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -105,6 +112,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
+#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 517b44300a05..f0ffd5ee77bd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf,
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
inode_item_err(leaf, slot,
- "invalid inode generation: has %llu expect [0, %llu]",
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
}
@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_root_item ri;
+ struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (ret < 0)
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
generic_err(leaf, slot,
- "invalid root item size, have %u expect %zu",
- btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
}
+ /*
+ * For legacy root item, the members starting at generation_v2 will be
+ * all filled with 0.
+ * And since we allow geneartion_v2 as 0, it will still pass the check.
+ */
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- sizeof(ri));
+ btrfs_item_size_nr(leaf, slot));
/* Generation related */
if (btrfs_root_generation(&ri) >
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ea8136dcf71f..56cbc1706b6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,6 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -3449,11 +3445,13 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
@@ -3613,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3632,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
if (min_key.objectid != ino || min_key.type != key_type)
goto done;
+
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
@@ -4036,11 +4042,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ break;
}
}
}
@@ -4053,7 +4056,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
- ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
@@ -4084,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4096,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
@@ -4142,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
int ret;
int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4344,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -4361,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4436,8 +4483,32 @@ process:
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4843,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
+ ctx);
btrfs_add_delayed_iput(inode);
}
}
@@ -4885,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -4901,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5114,8 +5185,6 @@ next_key:
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
@@ -5294,7 +5363,7 @@ log_extents:
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5303,31 +5372,8 @@ log_extents:
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
@@ -5341,19 +5387,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5593,7 +5654,7 @@ process_leaf:
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
+ log_mode, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
@@ -5737,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ LOG_INODE_ALL, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
ret = 1;
@@ -5788,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation > last_committed)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5844,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation > fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5952,8 +6012,6 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -6006,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6102,15 +6160,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6373,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6407,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..731bd9c029f5 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,8 +16,11 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d7670e2a9f39..58b9c419a2b6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
@@ -290,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -317,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
@@ -355,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -405,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
@@ -432,7 +434,8 @@ static struct btrfs_device *__alloc_device(void)
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
return dev;
}
@@ -592,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path,
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -940,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_warn_in_rcu(device->fs_info,
- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
- "device fsid %pU devid %llu moved old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -1034,28 +1037,21 @@ error:
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ int step, struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
+ &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
@@ -1093,10 +1089,22 @@ again:
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1148,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
+ lockdep_assert_held(&uuid_mutex);
+
if (--fs_devices->opened > 0)
- return 0;
+ return;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = false;
-
- return 0;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1196,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
if (fs_devices->open_devices == 0)
return -EINVAL;
@@ -1960,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device(
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
@@ -1998,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
{
struct btrfs_super_block *disk_super;
int copy_num;
@@ -2039,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2143,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2164,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
+ list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
@@ -2220,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(fs_info, srcdev->bdev,
- srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2235,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2245,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
@@ -2265,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2374,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
@@ -2398,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
- fs_devices->seed = seed_devices;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2510,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2524,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2612,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
- /* add sysfs device entry */
- btrfs_sysfs_add_devices_dir(fs_devices, device);
-
/*
* we've got more storage, clear any full flags on the space
* infos
@@ -2622,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
@@ -2647,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_prepare_sprout
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2656,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2691,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
error_sysfs:
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2717,7 +2711,7 @@ error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -4044,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -4180,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4191,7 +4185,7 @@ out:
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4293,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4375,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -4462,6 +4456,7 @@ int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
@@ -4486,6 +4481,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4493,7 +4489,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -4720,6 +4715,10 @@ again:
}
mutex_lock(&fs_info->chunk_mutex);
+ /* Clear all state bits beyond the shrunk device size */
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK);
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
@@ -6455,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
bool seed)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
+
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
+ }
- while (fs_devices) {
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
@@ -6467,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
return device;
}
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6479,8 +6485,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6517,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
+ dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
@@ -6713,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6735,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6742,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
@@ -7174,17 +7188,22 @@ error:
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
- fs_devices = fs_devices->seed;
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7210,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
-int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7228,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
}
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7481,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7579,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
+ struct btrfs_fs_devices *devs;
+
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
+ struct btrfs_fs_devices, seed_list);
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5eea93916fbf..bf27ac07d315 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,7 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
@@ -246,7 +246,7 @@ struct btrfs_fs_devices {
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
+ struct list_head seed_list;
bool seeding;
int opened;
@@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
@@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
diff --git a/fs/buffer.c b/fs/buffer.c
index 061dd202979d..5a28a6aa7f16 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1958,7 +1958,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
*/
set_buffer_new(bh);
set_buffer_unwritten(bh);
- /* FALLTHRU */
+ fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) ||
offset >= i_size_read(inode))
@@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
-#if 0
- /* Not really sure about this - do we need this ? */
- if (page->mapping->a_ops->invalidatepage)
- page->mapping->a_ops->invalidatepage(page, offset);
-#endif
unlock_page(page);
return 0; /* don't care */
}
@@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
- do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
@@ -3157,6 +3141,15 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
+ /*
+ * The bh should be mapped, but it might not be if the
+ * device was hot-removed. Not much we can do but fail the I/O.
+ */
+ if (!buffer_mapped(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index cf235f6eacf9..471e40156065 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -13,7 +13,7 @@ config CEPH_FS
scalable file system designed to provide high performance,
reliable access to petabytes of storage.
- More information at http://ceph.newdream.net/.
+ More information at https://ceph.io/.
If unsure, say N.
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 01ad09733ac7..6ea761c84494 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -862,8 +862,7 @@ static void writepages_finish(struct ceph_osd_request *req)
osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
- mempool_free(osd_data->pages,
- ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+ mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
@@ -955,10 +954,10 @@ retry:
int num_ops = 0, op_idx;
unsigned i, pvec_pages, max_pages, locked_pages = 0;
struct page **pages = NULL, **data_pages;
- mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
pgoff_t strip_unit_end = 0;
u64 offset = 0, len = 0;
+ bool from_pool = false;
max_pages = wsize >> PAGE_SHIFT;
@@ -1057,16 +1056,16 @@ get_more_pages:
sizeof(*pages),
GFP_NOFS);
if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
+ from_pool = true;
+ pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
BUG_ON(!pages);
}
len = 0;
} else if (page->index !=
(offset + len) >> PAGE_SHIFT) {
- if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
- CEPH_OSD_MAX_OPS)) {
+ if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
+ CEPH_OSD_MAX_OPS)) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
break;
@@ -1161,7 +1160,7 @@ new_request:
offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
- !!pool, false);
+ from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
len = 0;
@@ -1188,12 +1187,12 @@ new_request:
dout("writepages got pages at %llu~%llu\n", offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
- 0, !!pool, false);
+ 0, from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
BUG_ON(op_idx + 1 != req->r_num_ops);
- pool = NULL;
+ from_pool = false;
if (i < locked_pages) {
BUG_ON(num_ops <= req->r_num_ops);
num_ops -= req->r_num_ops;
@@ -1204,8 +1203,8 @@ new_request:
pages = kmalloc_array(locked_pages, sizeof(*pages),
GFP_NOFS);
if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
+ from_pool = true;
+ pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
BUG_ON(!pages);
}
memcpy(pages, data_pages + i,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 972c13aa4225..034b3f4fdd3a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -668,6 +668,7 @@ void ceph_add_cap(struct inode *inode,
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
+ atomic64_inc(&mdsc->metric.total_caps);
spin_unlock(&session->s_cap_lock);
} else {
spin_lock(&session->s_cap_lock);
@@ -886,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -898,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -910,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -1161,6 +1162,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&mdsc->metric.total_caps);
cap->session = NULL;
removed = 1;
}
@@ -2870,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- cw.ino = inode->i_ino;
+ cw.ino = ceph_ino(inode);
cw.tgid = current->tgid;
cw.need = need;
cw.want = want;
@@ -4187,10 +4189,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
struct ceph_inode_info *ci;
dout("check_delayed_caps\n");
- while (1) {
- spin_lock(&mdsc->cap_delay_lock);
- if (list_empty(&mdsc->cap_delay_list))
- break;
+ spin_lock(&mdsc->cap_delay_lock);
+ while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
@@ -4200,13 +4200,13 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
list_del_init(&ci->i_cap_delay_list);
inode = igrab(&ci->vfs_inode);
- spin_unlock(&mdsc->cap_delay_lock);
-
if (inode) {
+ spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
+ spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 070ed8481340..3e3fcda9b276 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -145,7 +145,7 @@ static int metric_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_client_metric *m = &mdsc->metric;
- int i, nr_caps = 0;
+ int nr_caps = 0;
s64 total, sum, avg, min, max, sq;
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
@@ -190,17 +190,7 @@ static int metric_show(struct seq_file *s, void *p)
percpu_counter_sum(&m->d_lease_mis),
percpu_counter_sum(&m->d_lease_hit));
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s;
-
- s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- nr_caps += s->s_nr_caps;
- ceph_put_mds_session(s);
- }
- mutex_unlock(&mdsc->mutex);
+ nr_caps = atomic64_read(&m->total_caps);
seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps,
percpu_counter_sum(&m->i_caps_mis),
percpu_counter_sum(&m->i_caps_hit));
@@ -212,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
- seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+ seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@@ -257,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
spin_lock(&mdsc->caps_list_lock);
list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
- seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
ceph_cap_string(cw->need),
ceph_cap_string(cw->want));
}
@@ -272,7 +262,7 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_auth_client *ac = fsc->client->monc.auth;
struct ceph_options *opt = fsc->client->options;
- int mds = -1;
+ int mds;
mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 39f5311404b0..d72e4a12bb69 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb,
- d_inode(dentry)->i_ino),
+ dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
d_inode(dentry)->i_mode >> 12)) {
dput(dentry);
err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* always start with . and .. */
if (ctx->pos == 0) {
dout("readdir off 0 -> '.'\n");
- if (!dir_emit(ctx, ".", 1,
- ceph_translate_ino(inode->i_sb, inode->i_ino),
+ if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
}
if (ctx->pos == 1) {
- ino_t ino = parent_ino(file->f_path.dentry);
+ u64 ino;
+ struct dentry *dentry = file->f_path.dentry;
+
+ spin_lock(&dentry->d_lock);
+ ino = ceph_present_inode(dentry->d_parent->d_inode);
+ spin_unlock(&dentry->d_lock);
+
dout("readdir off 1 -> '..'\n");
- if (!dir_emit(ctx, "..", 2,
- ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12))
+ if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
}
@@ -507,9 +508,6 @@ more:
}
for (; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- struct ceph_vino vino;
- ino_t ino;
- u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -519,13 +517,10 @@ more:
rde->name_len, rde->name, &rde->inode.in);
BUG_ON(!rde->inode.in);
- ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
- vino.ino = le64_to_cpu(rde->inode.in->ino);
- vino.snap = le64_to_cpu(rde->inode.in->snapid);
- ino = ceph_vino_to_ino(vino);
if (!dir_emit(ctx, rde->name, rde->name_len,
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
+ ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+ le32_to_cpu(rde->inode.in->mode) >> 12)) {
dout("filldir stopping us...\n");
return 0;
}
@@ -930,6 +925,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (as_ctx.pagelist) {
+ req->r_pagelist = as_ctx.pagelist;
+ as_ctx.pagelist = NULL;
+ }
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -1157,7 +1156,7 @@ retry:
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
- dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+ dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
@@ -1741,7 +1740,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
case -ENOENT:
if (d_really_is_negative(dentry))
valid = 1;
- /* Fallthrough */
+ fallthrough;
default:
break;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 160644ddaeed..3f4c993dfc6f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -252,7 +252,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFREG:
ceph_fscache_register_inode_cookie(inode);
ceph_fscache_file_set_cookie(inode, file);
- /* fall through */
+ fallthrough;
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
S_ISDIR(inode->i_mode));
@@ -630,8 +630,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
} else {
struct dentry *dn;
- dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
- vino.ino, dir->i_ino, dentry->d_name.name);
+ dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
@@ -1538,6 +1538,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct page *pinned_page = NULL;
+ bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
ssize_t ret;
int want, got = 0;
int retry_op = 0, read = 0;
@@ -1546,7 +1547,7 @@ again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_start_io_direct(inode);
else
ceph_start_io_read(inode);
@@ -1603,7 +1604,7 @@ again:
}
ceph_put_cap_refs(ci, got);
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_read(inode);
@@ -2506,6 +2507,7 @@ const struct file_operations ceph_file_fops = {
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
+ .setlease = simple_nosetlease,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..d163fa96cb40 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
*/
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ ci->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
return 0;
}
@@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+ ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW)
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
return inode;
}
@@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
}
generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+ stat->ino = ceph_present_inode(inode);
/*
* btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a50497142e59..4a26862d7667 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1103,8 +1103,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) {
- if (mode == USE_ANY_MDS &&
- !ceph_mdsmap_is_laggy(mdsc->mdsmap,
+ if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
mds))
goto out;
}
@@ -1168,7 +1167,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
-static void encode_supported_features(void **p, void *end)
+static int encode_supported_features(void **p, void *end)
{
static const size_t count = ARRAY_SIZE(feature_bits);
@@ -1176,16 +1175,64 @@ static void encode_supported_features(void **p, void *end)
size_t i;
size_t size = FEATURE_BYTES(count);
- BUG_ON(*p + 4 + size > end);
+ if (WARN_ON_ONCE(*p + 4 + size > end))
+ return -ERANGE;
+
ceph_encode_32(p, size);
memset(*p, 0, size);
for (i = 0; i < count; i++)
((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
*p += size;
} else {
- BUG_ON(*p + 4 > end);
+ if (WARN_ON_ONCE(*p + 4 > end))
+ return -ERANGE;
+
ceph_encode_32(p, 0);
}
+
+ return 0;
+}
+
+static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
+#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
+static int encode_metric_spec(void **p, void *end)
+{
+ static const size_t count = ARRAY_SIZE(metric_bits);
+
+ /* header */
+ if (WARN_ON_ONCE(*p + 2 > end))
+ return -ERANGE;
+
+ ceph_encode_8(p, 1); /* version */
+ ceph_encode_8(p, 1); /* compat */
+
+ if (count > 0) {
+ size_t i;
+ size_t size = METRIC_BYTES(count);
+
+ if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4 + size);
+
+ /* metric spec */
+ ceph_encode_32(p, size);
+ memset(*p, 0, size);
+ for (i = 0; i < count; i++)
+ ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
+ *p += size;
+ } else {
+ if (WARN_ON_ONCE(*p + 4 + 4 > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4);
+ /* metric spec */
+ ceph_encode_32(p, 0);
+ }
+
+ return 0;
}
/*
@@ -1203,6 +1250,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
size_t size, count;
void *p, *end;
+ int ret;
const char* metadata[][2] = {
{"hostname", mdsc->nodename},
@@ -1227,12 +1275,19 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = FEATURE_BYTES(count);
extra_bytes += 4 + size;
+ /* metric spec */
+ size = 0;
+ count = ARRAY_SIZE(metric_bits);
+ if (count > 0)
+ size = METRIC_BYTES(count);
+ extra_bytes += 2 + 4 + 4 + size;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
pr_err("create_session_msg ENOMEM creating msg\n");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
end = p + msg->front.iov_len;
@@ -1245,9 +1300,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v3
+ * ClientSession messages with metadata are v4
*/
- msg->hdr.version = cpu_to_le16(3);
+ msg->hdr.version = cpu_to_le16(4);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1269,7 +1324,20 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
p += val_len;
}
- encode_supported_features(&p, end);
+ ret = encode_supported_features(&p, end);
+ if (ret) {
+ pr_err("encode_supported_features failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
+ ret = encode_metric_spec(&p, end);
+ if (ret) {
+ pr_err("encode_metric_spec failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1297,8 +1365,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* send connect message */
msg = create_session_open_msg(mdsc, session->s_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -1312,6 +1380,7 @@ static struct ceph_mds_session *
__open_export_target_session(struct ceph_mds_client *mdsc, int target)
{
struct ceph_mds_session *session;
+ int ret;
session = __ceph_lookup_mds_session(mdsc, target);
if (!session) {
@@ -1320,8 +1389,11 @@ __open_export_target_session(struct ceph_mds_client *mdsc, int target)
return session;
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ ret = __open_session(mdsc, session);
+ if (ret)
+ return ERR_PTR(ret);
+ }
return session;
}
@@ -1485,6 +1557,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&session->s_mdsc->metric.total_caps);
if (cap->queue_release)
__ceph_queue_cap_release(session, cap);
else
@@ -1785,8 +1858,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
/*
* send a session close request
*/
-static int request_close_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int request_close_session(struct ceph_mds_session *session)
{
struct ceph_msg *msg;
@@ -1809,7 +1881,7 @@ static int __close_session(struct ceph_mds_client *mdsc,
if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
return 0;
session->s_state = CEPH_MDS_SESSION_CLOSING;
- return request_close_session(mdsc, session);
+ return request_close_session(session);
}
static bool drop_negative_children(struct dentry *dentry)
@@ -2520,7 +2592,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
ceph_encode_copy(&p, &ts, sizeof(ts));
}
- BUG_ON(p > end);
+ if (WARN_ON_ONCE(p > end)) {
+ ceph_msg_put(msg);
+ msg = ERR_PTR(-ERANGE);
+ goto out_free2;
+ }
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2756,7 +2833,9 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
- __open_session(mdsc, session);
+ err = __open_session(mdsc, session);
+ if (err)
+ goto out_session;
/* retry the same mds later */
if (random)
req->r_resend_mds = mds;
@@ -3279,8 +3358,10 @@ static void handle_session(struct ceph_mds_session *session,
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_64_safe(&p, end, features, bad);
- p += len - sizeof(features);
+ if (len) {
+ ceph_decode_64_safe(&p, end, features, bad);
+ p += len - sizeof(features);
+ }
}
mutex_lock(&mdsc->mutex);
@@ -3310,6 +3391,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_OPEN;
session->s_features = features;
renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
@@ -4263,6 +4346,30 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
ceph_force_reconnect(fsc->sb);
}
+bool check_session_state(struct ceph_mds_session *s)
+{
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ dout("resending session close request for mds%d\n",
+ s->s_mds);
+ request_close_session(s);
+ return false;
+ }
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info("mds%d hung\n", s->s_mds);
+ }
+ }
+ if (s->s_state == CEPH_MDS_SESSION_NEW ||
+ s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+ s->s_state == CEPH_MDS_SESSION_CLOSED ||
+ s->s_state == CEPH_MDS_SESSION_REJECTED)
+ /* this mds is failed or recovering, just wait */
+ return false;
+
+ return true;
+}
+
/*
* delayed work -- periodically trim expired leases, renew caps with mds
*/
@@ -4283,6 +4390,9 @@ static void delayed_work(struct work_struct *work)
dout("mdsc delayed_work\n");
+ if (mdsc->stopping)
+ return;
+
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
renew_caps = time_after_eq(jiffies, HZ*renew_interval +
@@ -4294,23 +4404,8 @@ static void delayed_work(struct work_struct *work)
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
if (!s)
continue;
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(mdsc, s);
- ceph_put_mds_session(s);
- continue;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
- s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
- }
- }
- if (s->s_state == CEPH_MDS_SESSION_NEW ||
- s->s_state == CEPH_MDS_SESSION_RESTARTING ||
- s->s_state == CEPH_MDS_SESSION_REJECTED) {
- /* this mds is failed or recovering, just wait */
+
+ if (!check_session_state(s)) {
ceph_put_mds_session(s);
continue;
}
@@ -4359,7 +4454,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
goto err_mdsc;
}
- fsc->mdsc = mdsc;
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -4414,6 +4508,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
strscpy(mdsc->nodename, utsname()->nodename,
sizeof(mdsc->nodename));
+
+ fsc->mdsc = mdsc;
return 0;
err_mdsmap:
@@ -4657,7 +4753,16 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ /*
+ * Make sure the delayed work stopped before releasing
+ * the resources.
+ *
+ * Because the cancel_delayed_work_sync() will only
+ * guarantee that the work finishes executing. But the
+ * delayed work will re-arm itself again after that.
+ */
+ flush_delayed_work(&mdsc->delayed_work);
+
if (mdsc->mdsmap)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
@@ -4680,6 +4785,7 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
ceph_metric_destroy(&mdsc->metric);
+ flush_delayed_work(&mdsc->metric.delayed_work);
fsc->mdsc = NULL;
kfree(mdsc);
dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5e0c4073a6be..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -18,6 +18,7 @@
#include <linux/ceph/auth.h>
#include "metric.h"
+#include "super.h"
/* The first 8 bits are reserved for old ceph releases */
enum ceph_feature_type {
@@ -27,8 +28,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_METRIC_COLLECT,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
};
/*
@@ -42,6 +44,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
+ CEPHFS_FEATURE_METRIC_COLLECT, \
\
CEPHFS_FEATURE_MAX, \
}
@@ -369,7 +372,7 @@ struct ceph_quotarealm_inode {
struct cap_wait {
struct list_head list;
- unsigned long ino;
+ u64 ino;
pid_t tgid;
int need;
int want;
@@ -476,6 +479,8 @@ struct ceph_mds_client {
extern const char *ceph_mds_op_name(int op);
+extern bool check_session_state(struct ceph_mds_session *s);
+
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 889627817e52..e4aba6c6d3b5 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -120,7 +120,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
const void *start = *p;
int i, j, n;
int err;
- u8 mdsmap_v, mdsmap_cv;
+ u8 mdsmap_v;
u16 mdsmap_ev;
m = kzalloc(sizeof(*m), GFP_NOFS);
@@ -129,7 +129,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_decode_need(p, end, 1 + 1, bad);
mdsmap_v = ceph_decode_8(p);
- mdsmap_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* mdsmap_cv */
if (mdsmap_v >= 4) {
u32 mdsmap_len;
ceph_decode_32_safe(p, end, mdsmap_len, bad);
@@ -174,7 +174,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u64 global_id;
u32 namelen;
s32 mds, inc, state;
- u64 state_seq;
u8 info_v;
void *info_end = NULL;
struct ceph_entity_addr addr;
@@ -189,9 +188,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info_v= ceph_decode_8(p);
if (info_v >= 4) {
u32 info_len;
- u8 info_cv;
ceph_decode_need(p, end, 1 + sizeof(u32), bad);
- info_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* info_cv */
info_len = ceph_decode_32(p);
info_end = *p + info_len;
if (info_end > end)
@@ -210,7 +208,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
mds = ceph_decode_32(p);
inc = ceph_decode_32(p);
state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
+ *p += sizeof(u64); /* state_seq */
err = ceph_decode_entity_addr(p, end, &addr);
if (err)
goto corrupt;
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 9217f35bc2b9..2466b261fba2 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -1,10 +1,150 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/math64.h>
#include "metric.h"
+#include "mds_client.h"
+
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ struct ceph_metric_head *head;
+ struct ceph_metric_cap *cap;
+ struct ceph_metric_read_latency *read;
+ struct ceph_metric_write_latency *write;
+ struct ceph_metric_metadata_latency *meta;
+ struct ceph_client_metric *m = &mdsc->metric;
+ u64 nr_caps = atomic64_read(&m->total_caps);
+ struct ceph_msg *msg;
+ struct timespec64 ts;
+ s64 sum;
+ s32 items = 0;
+ s32 len;
+
+ len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+ + sizeof(*meta);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+ if (!msg) {
+ pr_err("send metrics to mds%d, failed to allocate message\n",
+ s->s_mds);
+ return false;
+ }
+
+ head = msg->front.iov_base;
+
+ /* encode the cap metric */
+ cap = (struct ceph_metric_cap *)(head + 1);
+ cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+ cap->ver = 1;
+ cap->compat = 1;
+ cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+ cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
+ cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+ cap->total = cpu_to_le64(nr_caps);
+ items++;
+
+ /* encode the read latency metric */
+ read = (struct ceph_metric_read_latency *)(cap + 1);
+ read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+ read->ver = 1;
+ read->compat = 1;
+ read->data_len = cpu_to_le32(sizeof(*read) - 10);
+ sum = m->read_latency_sum;
+ jiffies_to_timespec64(sum, &ts);
+ read->sec = cpu_to_le32(ts.tv_sec);
+ read->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ /* encode the write latency metric */
+ write = (struct ceph_metric_write_latency *)(read + 1);
+ write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+ write->ver = 1;
+ write->compat = 1;
+ write->data_len = cpu_to_le32(sizeof(*write) - 10);
+ sum = m->write_latency_sum;
+ jiffies_to_timespec64(sum, &ts);
+ write->sec = cpu_to_le32(ts.tv_sec);
+ write->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ /* encode the metadata latency metric */
+ meta = (struct ceph_metric_metadata_latency *)(write + 1);
+ meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+ meta->ver = 1;
+ meta->compat = 1;
+ meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+ sum = m->metadata_latency_sum;
+ jiffies_to_timespec64(sum, &ts);
+ meta->sec = cpu_to_le32(ts.tv_sec);
+ meta->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ put_unaligned_le32(items, &head->num);
+ msg->front.iov_len = len;
+ msg->hdr.version = cpu_to_le16(1);
+ msg->hdr.compat_version = cpu_to_le16(1);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("client%llu send metrics to mds%d\n",
+ ceph_client_gid(mdsc->fsc->client), s->s_mds);
+ ceph_con_send(&s->s_con, msg);
+
+ return true;
+}
+
+
+static void metric_get_session(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *s;
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+
+ /*
+ * Skip it if MDS doesn't support the metric collection,
+ * or the MDS will close the session's socket connection
+ * directly when it get this message.
+ */
+ if (check_session_state(s) &&
+ test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
+ mdsc->metric.session = s;
+ break;
+ }
+
+ ceph_put_mds_session(s);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+static void metric_delayed_work(struct work_struct *work)
+{
+ struct ceph_client_metric *m =
+ container_of(work, struct ceph_client_metric, delayed_work.work);
+ struct ceph_mds_client *mdsc =
+ container_of(m, struct ceph_mds_client, metric);
+
+ if (mdsc->stopping)
+ return;
+
+ if (!m->session || !check_session_state(m->session)) {
+ if (m->session) {
+ ceph_put_mds_session(m->session);
+ m->session = NULL;
+ }
+ metric_get_session(mdsc);
+ }
+ if (m->session) {
+ ceph_mdsc_send_metrics(mdsc, m->session);
+ metric_schedule_delayed(m);
+ }
+}
int ceph_metric_init(struct ceph_client_metric *m)
{
@@ -22,6 +162,7 @@ int ceph_metric_init(struct ceph_client_metric *m)
if (ret)
goto err_d_lease_mis;
+ atomic64_set(&m->total_caps, 0);
ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL);
if (ret)
goto err_i_caps_hit;
@@ -51,6 +192,9 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->total_metadatas = 0;
m->metadata_latency_sum = 0;
+ m->session = NULL;
+ INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
+
return 0;
err_i_caps_mis:
@@ -72,6 +216,11 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
percpu_counter_destroy(&m->i_caps_hit);
percpu_counter_destroy(&m->d_lease_mis);
percpu_counter_destroy(&m->d_lease_hit);
+
+ cancel_delayed_work_sync(&m->delayed_work);
+
+ if (m->session)
+ ceph_put_mds_session(m->session);
}
static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index ccd81285a450..1d0959d669d7 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -6,12 +6,91 @@
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
+extern bool disable_send_metrics;
+
+enum ceph_metric_type {
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/*
+ * This will always have the highest metric bit value
+ * as the last element of the array.
+ */
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
+}
+
+/* metric caps header */
+struct ceph_metric_cap {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(hit + mis + total) */
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __packed;
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __packed;
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __packed;
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __packed;
+
+struct ceph_metric_head {
+ __le32 num; /* the number of metrics that will be sent */
+} __packed;
+
/* This is the global metrics */
struct ceph_client_metric {
atomic64_t total_dentries;
struct percpu_counter d_lease_hit;
struct percpu_counter d_lease_mis;
+ atomic64_t total_caps;
struct percpu_counter i_caps_hit;
struct percpu_counter i_caps_mis;
@@ -35,8 +114,20 @@ struct ceph_client_metric {
ktime_t metadata_latency_sq_sum;
ktime_t metadata_latency_min;
ktime_t metadata_latency_max;
+
+ struct ceph_mds_session *session;
+ struct delayed_work delayed_work; /* delayed work */
};
+static inline void metric_schedule_delayed(struct ceph_client_metric *m)
+{
+ if (disable_send_metrics)
+ return;
+
+ /* per second */
+ schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ));
+}
+
extern int ceph_metric_init(struct ceph_client_metric *m);
extern void ceph_metric_destroy(struct ceph_client_metric *m);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct super_block *sb = mdsc->fsc->sb;
+ struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
return true;
/* if root is the real CephFS root, we don't have quota realms */
- if (sb->s_root->d_inode &&
- (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+ if (root && ceph_ino(root) == CEPH_INO_ROOT)
return false;
/* otherwise, we can't know for sure */
return true;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c9784eb1159a..7ec0e6d03d10 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,9 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+static DEFINE_SPINLOCK(ceph_fsc_lock);
+static LIST_HEAD(ceph_fsc_list);
+
/*
* Ceph superblock operations
*
@@ -634,8 +637,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- int page_count;
- size_t size;
int err;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
@@ -683,18 +684,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->cap_wq)
goto fail_inode_wq;
- /* set up mempools */
- err = -ENOMEM;
- page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
- size = sizeof (struct page *) * (page_count ? page_count : 1);
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
- if (!fsc->wb_pagevec_pool)
- goto fail_cap_wq;
+ spin_lock(&ceph_fsc_lock);
+ list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
+ spin_unlock(&ceph_fsc_lock);
return fsc;
-fail_cap_wq:
- destroy_workqueue(fsc->cap_wq);
fail_inode_wq:
destroy_workqueue(fsc->inode_wq);
fail_client:
@@ -717,12 +712,14 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
+ spin_lock(&ceph_fsc_lock);
+ list_del(&fsc->metric_wakeup);
+ spin_unlock(&ceph_fsc_lock);
+
ceph_mdsc_destroy(fsc);
destroy_workqueue(fsc->inode_wq);
destroy_workqueue(fsc->cap_wq);
- mempool_destroy(fsc->wb_pagevec_pool);
-
destroy_mount_options(fsc->mount_options);
ceph_destroy_client(fsc->client);
@@ -741,6 +738,7 @@ struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
struct kmem_cache *ceph_mds_request_cachep;
+mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
@@ -785,6 +783,10 @@ static int __init init_caches(void)
if (!ceph_mds_request_cachep)
goto bad_mds_req;
+ ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT);
+ if (!ceph_wb_pagevec_pool)
+ goto bad_pagevec_pool;
+
error = ceph_fscache_register();
if (error)
goto bad_fscache;
@@ -793,6 +795,8 @@ static int __init init_caches(void)
bad_fscache:
kmem_cache_destroy(ceph_mds_request_cachep);
+bad_pagevec_pool:
+ mempool_destroy(ceph_wb_pagevec_pool);
bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
@@ -823,12 +827,13 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
kmem_cache_destroy(ceph_mds_request_cachep);
+ mempool_destroy(ceph_wb_pagevec_pool);
ceph_fscache_unregister();
}
/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
+ * ceph_umount_begin - initiate forced umount. Tear down the
* mount, skipping steps that may hang while waiting for server(s).
*/
static void ceph_umount_begin(struct super_block *sb)
@@ -1282,6 +1287,37 @@ static void __exit exit_ceph(void)
destroy_caches();
}
+static int param_set_metrics(const char *val, const struct kernel_param *kp)
+{
+ struct ceph_fs_client *fsc;
+ int ret;
+
+ ret = param_set_bool(val, kp);
+ if (ret) {
+ pr_err("Failed to parse sending metrics switch value '%s'\n",
+ val);
+ return ret;
+ } else if (!disable_send_metrics) {
+ // wake up all the mds clients
+ spin_lock(&ceph_fsc_lock);
+ list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) {
+ metric_schedule_delayed(&fsc->mdsc->metric);
+ }
+ spin_unlock(&ceph_fsc_lock);
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_metrics = {
+ .set = param_set_metrics,
+ .get = param_get_bool,
+};
+
+bool disable_send_metrics = false;
+module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
+MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 5a6cdd39bc10..a3995ebe0623 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -101,6 +101,8 @@ struct ceph_mount_options {
struct ceph_fs_client {
struct super_block *sb;
+ struct list_head metric_wakeup;
+
struct ceph_mount_options *mount_options;
struct ceph_client *client;
@@ -116,8 +118,6 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
- /* writeback */
- mempool_t *wb_pagevec_pool;
atomic_long_t writeback_count;
struct workqueue_struct *inode_wq;
@@ -353,7 +353,7 @@ struct ceph_inode_info {
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
/*
- * Link to the the auth cap's session's s_cap_dirty list. s_cap_dirty
+ * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
* is protected by the mdsc->cap_dirty_lock, but each individual item
* is also protected by the inode's i_ceph_lock. Walking s_cap_dirty
* requires the mdsc->cap_dirty_lock. List presence for an item can
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
return ceph_inode(inode)->i_vino;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * i_ino (kernel inode) st_ino (userspace)
- * i386 32 32
- * x86_64+ino32 64 32
- * x86_64 64 64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
{
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
@@ -475,35 +467,18 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
}
/*
- * kernel i_ino value
+ * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
+ * some arches. We generally do not use this value inside the ceph driver, but
+ * we do want to set it to something, so that generic vfs code has an
+ * appropriate value for tracepoints and the like.
*/
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
{
-#if BITS_PER_LONG == 32
- return ceph_ino_to_ino32(vino.ino);
-#else
+ if (sizeof(ino_t) == sizeof(u32))
+ return ceph_ino_to_ino32(vino.ino);
return (ino_t)vino.ino;
-#endif
}
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
- ino = ceph_ino_to_ino32(ino);
- return ino;
-}
-#endif
-
-
/* for printf-style formatting */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
@@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode)
{
return ceph_inode(inode)->i_vino.ino;
}
+
static inline u64 ceph_snap(struct inode *inode)
{
return ceph_inode(inode)->i_vino.snap;
}
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ return ceph_ino_to_ino32(ino);
+ return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+ return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
ci->i_vino.snap == pvino->snap;
}
+
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
+ /*
+ * NB: The hashval will be run through the fs/inode.c hash function
+ * anyway, so there is no need to squash the inode number down to
+ * 32-bits first. Just use low-order bits on arches with 32-bit long.
+ */
+ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 71ee34d160c3..3a733ac33d9b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -497,10 +497,10 @@ static int __set_xattr(struct ceph_inode_info *ci,
kfree(*newxattr);
*newxattr = NULL;
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
if (update_xattr) {
- kfree((void *)name);
+ kfree(name);
name = xattr->name;
}
ci->i_xattrs.names_size -= xattr->name_len;
@@ -566,9 +566,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr)
BUG_ON(!xattr);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
kfree(xattr);
}
@@ -582,9 +582,9 @@ static int __remove_xattr(struct ceph_inode_info *ci,
rb_erase(&xattr->node, &ci->i_xattrs.index);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
ci->i_xattrs.names_size -= xattr->name_len;
ci->i_xattrs.vals_size -= xattr->val_len;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b296964b8afa..b565d83ba89e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -2031,4 +2031,19 @@ static inline bool is_smb1_server(struct TCP_Server_Info *server)
return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0;
}
+static inline bool is_tcon_dfs(struct cifs_tcon *tcon)
+{
+ /*
+ * For SMB1, see MS-CIFS 2.4.55 SMB_COM_TREE_CONNECT_ANDX (0x75) and MS-CIFS 3.3.4.4 DFS
+ * Subsystem Notifies That a Share Is a DFS Share.
+ *
+ * For SMB2+, see MS-SMB2 2.2.10 SMB2 TREE_CONNECT Response and MS-SMB2 3.3.4.14 Server
+ * Application Updates a Share.
+ */
+ if (!tcon || !tcon->ses || !tcon->ses->server)
+ return false;
+ return is_smb1_server(tcon->ses->server) ? tcon->Flags & SMB_SHARE_IS_IN_DFS :
+ tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT);
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 0e763d2dcf16..0496934feecb 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -581,7 +581,7 @@ should_set_ext_sec_flag(enum securityEnum sectype)
if (global_secflags &
(CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
return true;
- /* Fallthrough */
+ fallthrough;
default:
return false;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0ad1309e88d3..a5731dd6e656 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1378,25 +1378,25 @@ static int cifs_parse_security_flavors(char *value,
return 1;
case Opt_sec_krb5i:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_sec_krb5:
vol->sectype = Kerberos;
break;
case Opt_sec_ntlmsspi:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_sec_ntlmssp:
vol->sectype = RawNTLMSSP;
break;
case Opt_sec_ntlmi:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_ntlm:
vol->sectype = NTLM;
break;
case Opt_sec_ntlmv2i:
vol->sign = true;
- /* Fallthrough */
+ fallthrough;
case Opt_sec_ntlmv2:
vol->sectype = NTLMv2;
break;
@@ -2187,7 +2187,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->password = NULL;
break;
}
- /* Fallthrough - to Opt_pass below.*/
+ fallthrough; /* to Opt_pass below */
case Opt_pass:
/* Obtain the value string */
value = strchr(data, '=');
@@ -4886,6 +4886,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
full_path = build_unc_path_to_root(vol, cifs_sb, !!count);
if (IS_ERR(full_path)) {
rc = PTR_ERR(full_path);
+ full_path = NULL;
break;
}
/* Chase referral */
@@ -4908,7 +4909,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
if (!tcon)
continue;
/* Make sure that requests go through new root servers */
- if (tcon->share_flags & (SHI1005_FLAGS_DFS | SHI1005_FLAGS_DFS_ROOT)) {
+ if (is_tcon_dfs(tcon)) {
put_root_ses(root_ses);
set_root_ses(cifs_sb, ses, &root_ses);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3989d08396ac..1f75b25e559a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1017,6 +1017,8 @@ handle_mnt_opt:
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true,
full_path, fid);
+ if (rc == -EREMOTE)
+ rc = 0;
if (rc) {
cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n",
__func__, rc);
@@ -1025,6 +1027,8 @@ handle_mnt_opt:
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
full_path, fid);
+ if (rc == -EREMOTE)
+ rc = 0;
if (rc) {
cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
__func__, rc);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 69cd5856621b..de564368a887 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -798,7 +798,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if ((server->sec_kerberos || server->sec_mskerberos) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
- /* Fallthrough */
+ fallthrough;
default:
return Unspecified;
}
@@ -815,7 +815,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
default:
break;
}
- /* Fallthrough - to attempt LANMAN authentication next */
+ fallthrough; /* to attempt LANMAN authentication next */
case CIFS_NEGFLAVOR_LANMAN:
switch (requested) {
case LANMAN:
@@ -823,7 +823,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
case Unspecified:
if (global_secflags & CIFSSEC_MAY_LANMAN)
return LANMAN;
- /* Fallthrough */
+ fallthrough;
default:
return Unspecified;
}
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index b9db73687eaa..eba01d0908dd 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -115,6 +115,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
vars->oparms.fid = &fid;
vars->oparms.reconnect = false;
vars->oparms.mode = mode;
+ vars->oparms.cifs_sb = cifs_sb;
rqst[num_rqst].rq_iov = &vars->open_iov[0];
rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 32f90dc82c84..d44df8f95bcd 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rqst[1].rq_iov = si_iov;
rqst[1].rq_nvec = 1;
- len = sizeof(ea) + ea_name_len + ea_value_len + 1;
+ len = sizeof(*ea) + ea_name_len + ea_value_len + 1;
ea = kzalloc(len, GFP_KERNEL);
if (ea == NULL) {
rc = -ENOMEM;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 24c2ac360591..96c172d94fba 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1101,7 +1101,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if ((server->sec_kerberos || server->sec_mskerberos) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
- /* Fallthrough */
+ fallthrough;
default:
return Unspecified;
}
@@ -3913,7 +3913,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
case MID_RESPONSE_MALFORMED:
credits.value = le16_to_cpu(shdr->CreditRequest);
credits.instance = server->reconnect_instance;
- /* fall through */
+ fallthrough;
default:
rdata->result = -EIO;
}
@@ -4146,7 +4146,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
case MID_RESPONSE_MALFORMED:
credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
credits.instance = server->reconnect_instance;
- /* fall through */
+ fallthrough;
default:
wdata->result = -EIO;
break;
diff --git a/fs/compat.c b/fs/compat.c
deleted file mode 100644
index 436d228cf71c..000000000000
--- a/fs/compat.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/compat.c
- *
- * Kernel compatibililty routines for e.g. 32 bit syscall support
- * on 64 bit kernels.
- *
- * Copyright (C) 2002 Stephen Rothwell, IBM Corporation
- * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
- * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
- * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
- */
-
-#include <linux/compat.h>
-#include <linux/nfs4_mount.h>
-#include <linux/syscalls.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include "internal.h"
-
-struct compat_nfs_string {
- compat_uint_t len;
- compat_uptr_t data;
-};
-
-static inline void compat_nfs_string(struct nfs_string *dst,
- struct compat_nfs_string *src)
-{
- dst->data = compat_ptr(src->data);
- dst->len = src->len;
-}
-
-struct compat_nfs4_mount_data_v1 {
- compat_int_t version;
- compat_int_t flags;
- compat_int_t rsize;
- compat_int_t wsize;
- compat_int_t timeo;
- compat_int_t retrans;
- compat_int_t acregmin;
- compat_int_t acregmax;
- compat_int_t acdirmin;
- compat_int_t acdirmax;
- struct compat_nfs_string client_addr;
- struct compat_nfs_string mnt_path;
- struct compat_nfs_string hostname;
- compat_uint_t host_addrlen;
- compat_uptr_t host_addr;
- compat_int_t proto;
- compat_int_t auth_flavourlen;
- compat_uptr_t auth_flavours;
-};
-
-static int do_nfs4_super_data_conv(void *raw_data)
-{
- int version = *(compat_uint_t *) raw_data;
-
- if (version == 1) {
- struct compat_nfs4_mount_data_v1 *raw = raw_data;
- struct nfs4_mount_data *real = raw_data;
-
- /* copy the fields backwards */
- real->auth_flavours = compat_ptr(raw->auth_flavours);
- real->auth_flavourlen = raw->auth_flavourlen;
- real->proto = raw->proto;
- real->host_addr = compat_ptr(raw->host_addr);
- real->host_addrlen = raw->host_addrlen;
- compat_nfs_string(&real->hostname, &raw->hostname);
- compat_nfs_string(&real->mnt_path, &raw->mnt_path);
- compat_nfs_string(&real->client_addr, &raw->client_addr);
- real->acdirmax = raw->acdirmax;
- real->acdirmin = raw->acdirmin;
- real->acregmax = raw->acregmax;
- real->acregmin = raw->acregmin;
- real->retrans = raw->retrans;
- real->timeo = raw->timeo;
- real->wsize = raw->wsize;
- real->rsize = raw->rsize;
- real->flags = raw->flags;
- real->version = raw->version;
- }
-
- return 0;
-}
-
-#define NFS4_NAME "nfs4"
-
-COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
- const char __user *, dir_name,
- const char __user *, type, compat_ulong_t, flags,
- const void __user *, data)
-{
- char *kernel_type;
- void *options;
- char *kernel_dev;
- int retval;
-
- kernel_type = copy_mount_string(type);
- retval = PTR_ERR(kernel_type);
- if (IS_ERR(kernel_type))
- goto out;
-
- kernel_dev = copy_mount_string(dev_name);
- retval = PTR_ERR(kernel_dev);
- if (IS_ERR(kernel_dev))
- goto out1;
-
- options = copy_mount_options(data);
- retval = PTR_ERR(options);
- if (IS_ERR(options))
- goto out2;
-
- if (kernel_type && options) {
- if (!strcmp(kernel_type, NFS4_NAME)) {
- retval = -EINVAL;
- if (do_nfs4_super_data_conv(options))
- goto out3;
- }
- }
-
- retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
-
- out3:
- kfree(options);
- out2:
- kfree(kernel_dev);
- out1:
- kfree(kernel_type);
- out:
- return retval;
-}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index cb733652ecca..ca2273727225 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1688,11 +1688,11 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case 1:
offset += file->f_pos;
- /* fall through */
+ fallthrough;
case 0:
if (offset >= 0)
break;
- /* fall through */
+ fallthrough;
default:
return -EINVAL;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 7237f07ff6be..76e7c10edfc0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -153,10 +153,10 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
return ret;
}
-static int cn_print_exe_file(struct core_name *cn)
+static int cn_print_exe_file(struct core_name *cn, bool name_only)
{
struct file *exe_file;
- char *pathbuf, *path;
+ char *pathbuf, *path, *ptr;
int ret;
exe_file = get_mm_exe_file(current->mm);
@@ -175,6 +175,11 @@ static int cn_print_exe_file(struct core_name *cn)
goto free_buf;
}
+ if (name_only) {
+ ptr = strrchr(path, '/');
+ if (ptr)
+ path = ptr + 1;
+ }
ret = cn_esc_printf(cn, "%s", path);
free_buf:
@@ -301,12 +306,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
utsname()->nodename);
up_read(&uts_sem);
break;
- /* executable */
+ /* executable, could be changed by prctl PR_SET_NAME etc */
case 'e':
err = cn_esc_printf(cn, "%s", current->comm);
break;
+ /* file name of executable */
+ case 'f':
+ err = cn_print_exe_file(cn, true);
+ break;
case 'E':
- err = cn_print_exe_file(cn);
+ err = cn_print_exe_file(cn, false);
break;
/* core limit size */
case 'c':
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 9212325763b0..4ef3f714046a 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- if (inode)
+ if (inode && inode->i_ino)
printk("%sfscrypt (%s, inode %lu): %pV\n",
level, inode->i_sb->s_id, inode->i_ino, &vaf);
+ else if (inode)
+ printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf);
else
printk("%sfscrypt: %pV\n", level, &vaf);
va_end(args);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 011830f84d8d..1fbe6c24d705 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -61,15 +61,6 @@ struct fscrypt_nokey_name {
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
-static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
-{
- struct sha256_state sctx;
-
- sha256_init(&sctx);
- sha256_update(&sctx, data, data_len);
- sha256_final(&sctx, result);
-}
-
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst)
return cp - dst;
}
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret)
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
- const struct fscrypt_info *ci = inode->i_crypt_info;
- int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) &
+ int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
u32 encrypted_len;
@@ -260,8 +251,6 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
- * @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
* @crypto_str: (output) buffer to allocate
@@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_alloc_buffer(const struct inode *inode,
- u32 max_encrypted_len,
+int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
struct fscrypt_str *crypto_str)
{
const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
@@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
} else {
memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
/* Compute strong hash of remaining part of name. */
- fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
- iname->len - sizeof(nokey_name.bytes),
- nokey_name.sha256);
+ sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
@@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
* directory's encryption key, then @iname is the plaintext, so we encrypt it to
* get the disk_name.
*
- * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
- * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
- * impossible in this case, so we fail them with ENOKEY.
+ * Else, for keyless @lookup operations, @iname should be a no-key name, so we
+ * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will
+ * be impossible in this case, so we fail them with ENOKEY.
*
* If successful, fscrypt_free_filename() must be called later to clean up.
*
@@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(dir, iname->len,
+ if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
+ iname->len,
dir->i_sb->s_cop->max_namelen,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
@@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
}
if (!lookup)
return -ENOKEY;
- fname->is_ciphertext_name = true;
+ fname->is_nokey_name = true;
/*
* We don't have the key and we are doing a lookup; decode the
@@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
{
const struct fscrypt_nokey_name *nokey_name =
(const void *)fname->crypto_buf.name;
- u8 sha256[SHA256_DIGEST_SIZE];
+ u8 digest[SHA256_DIGEST_SIZE];
if (likely(fname->disk_name.name)) {
if (de_name_len != fname->disk_name.len)
@@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
return false;
if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
return false;
- fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
- de_name_len - sizeof(nokey_name->bytes), sha256);
- return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
+ sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), digest);
+ return !memcmp(digest, nokey_name->sha256, sizeof(digest));
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);
@@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);
* Validate dentries in encrypted directories to make sure we aren't potentially
* caching stale dentries after a key has been added.
*/
-static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *dir;
int err;
@@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
/*
* Plaintext names are always valid, since fscrypt doesn't support
- * reverting to ciphertext names without evicting the directory's inode
+ * reverting to no-key names without evicting the directory's inode
* -- which implies eviction of the dentries in the directory.
*/
- if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+ if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
return 1;
/*
- * Ciphertext name; valid if the directory's key is still unavailable.
+ * No-key name; valid if the directory's key is still unavailable.
*
- * Although fscrypt forbids rename() on ciphertext names, we still must
- * use dget_parent() here rather than use ->d_parent directly. That's
+ * Although fscrypt forbids rename() on no-key names, we still must use
+ * dget_parent() here rather than use ->d_parent directly. That's
* because a corrupted fs image may contain directory hard links, which
* the VFS handles by moving the directory's dentry tree in the dcache
* each time ->lookup() finds the directory and it already has a dentry
@@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return valid;
}
+EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8117a61b6f55..4f5806a3b73d 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
return NULL;
}
-#undef fscrypt_policy
union fscrypt_policy {
u8 version;
struct fscrypt_policy_v1 v1;
@@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
/* fname.c */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret);
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
const struct fscrypt_master_key *mk);
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk);
+
/* keysetup_v1.c */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
@@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
const union fscrypt_context *ctx_u,
int ctx_size);
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir);
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 09fb8aa0f2e9..20b0df47fe6a 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
if (err)
return err;
- /* ... in case we looked up ciphertext name before key was added */
- if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name before key was added */
+ if (dentry->d_flags & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (!fscrypt_has_permitted_context(dir, inode))
@@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
return err;
- /* ... in case we looked up ciphertext name(s) before key was added */
- if ((old_dentry->d_flags | new_dentry->d_flags) &
- DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name(s) before key was added */
+ if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (old_dir != new_dir) {
@@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
- if (fname->is_ciphertext_name) {
+ if (fname->is_nokey_name) {
spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
+ dentry->d_flags |= DCACHE_NOKEY_NAME;
spin_unlock(&dentry->d_lock);
d_set_d_op(dentry, &fscrypt_d_ops);
}
@@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode,
return 0;
}
-int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
- unsigned int max_len,
- struct fscrypt_str *disk_link)
+/**
+ * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink
+ * @dir: directory in which the symlink is being created
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @max_len: space the filesystem has available to store the symlink target
+ * @disk_link: (out) the on-disk symlink target being prepared
+ *
+ * This function computes the size the symlink target will require on-disk,
+ * stores it in @disk_link->len, and validates it against @max_len. An
+ * encrypted symlink may be longer than the original.
+ *
+ * Additionally, @disk_link->name is set to @target if the symlink will be
+ * unencrypted, but left NULL if the symlink will be encrypted. For encrypted
+ * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
+ * on-disk target later. (The reason for the two-step process is that some
+ * filesystems need to know the size of the symlink target before creating the
+ * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
+ *
+ * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
+ * -ENOKEY if the encryption key is missing, or another -errno code if a problem
+ * occurred while setting up the encryption key.
+ */
+int fscrypt_prepare_symlink(struct inode *dir, const char *target,
+ unsigned int len, unsigned int max_len,
+ struct fscrypt_str *disk_link)
{
- int err;
+ const union fscrypt_policy *policy;
/*
* To calculate the size of the encrypted symlink target we need to know
* the amount of NUL padding, which is determined by the flags set in
* the encryption policy which will be inherited from the directory.
- * The easiest way to get access to this is to just load the directory's
- * fscrypt_info, since we'll need it to create the dir_entry anyway.
- *
- * Note: in test_dummy_encryption mode, @dir may be unencrypted.
*/
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- if (!fscrypt_has_encryption_key(dir))
- return -ENOKEY;
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL) {
+ /* Not encrypted */
+ disk_link->name = (unsigned char *)target;
+ disk_link->len = len + 1;
+ if (disk_link->len > max_len)
+ return -ENAMETOOLONG;
+ return 0;
+ }
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
/*
* Calculate the size of the encrypted symlink and verify it won't
@@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(dir, len,
+ if (!fscrypt_fname_encrypted_size(policy, len,
max_len - sizeof(struct fscrypt_symlink_data),
&disk_link->len))
return -ENAMETOOLONG;
@@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
disk_link->name = NULL;
return 0;
}
-EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink);
+EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
unsigned int len, struct fscrypt_str *disk_link)
@@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
struct fscrypt_symlink_data *sd;
unsigned int ciphertext_len;
- err = fscrypt_require_key(inode);
- if (err)
- return err;
+ /*
+ * fscrypt_prepare_new_inode() should have already set up the new
+ * symlink inode's encryption key. We don't wait until now to do it,
+ * since we may be in a filesystem transaction now.
+ */
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode)))
+ return -ENOKEY;
if (disk_link->name) {
/* filesystem-provided buffer */
@@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
if (cstr.len + sizeof(*sd) - 1 > max_size)
return ERR_PTR(-EUCLEAN);
- err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
if (err)
return ERR_PTR(err);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index faa25541ccb6..89bffa82ed74 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
crypto_cfg.data_unit_size = sb->s_blocksize;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
num_devs = fscrypt_get_num_devices(sb);
- devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS);
+ devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
if (!devs)
return -ENOMEM;
fscrypt_get_devices(sb, num_devs, devs);
@@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
struct fscrypt_blk_crypto_key *blk_key;
int err;
int i;
- unsigned int flags;
- blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS);
+ blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
if (!blk_key)
return -ENOMEM;
@@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
}
queue_refs++;
- flags = memalloc_nofs_save();
err = blk_crypto_start_using_key(&blk_key->base,
blk_key->devs[i]);
- memalloc_nofs_restore(flags);
if (err) {
fscrypt_err(inode,
"error %d starting to use blk-crypto", err);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index e74f239c4428..53cc552a7b8f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb,
struct list_head *pos;
size_t busy_count = 0;
unsigned long ino;
+ char ino_str[50] = "";
spin_lock(&mk->mk_decrypted_inodes_lock);
@@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb,
}
spin_unlock(&mk->mk_decrypted_inodes_lock);
+ /* If the inode is currently being created, ino may still be 0. */
+ if (ino)
+ snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino);
+
fscrypt_warn(NULL,
- "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu",
+ "%s: %zu inode(s) still busy after removing key with %s %*phN%s",
sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
- ino);
+ ino_str);
return -EBUSY;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index fea6226afc2b..d3c3e5d9b41f 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -10,6 +10,7 @@
#include <crypto/skcipher.h>
#include <linux/key.h>
+#include <linux/random.h>
#include "fscrypt_private.h"
@@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk)
+{
+ WARN_ON(ci->ci_inode->i_ino == 0);
+ WARN_ON(!mk->mk_ino_hash_key_initialized);
+
+ ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
+ &mk->mk_ino_hash_key);
+}
+
static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk)
{
@@ -254,13 +265,20 @@ unlock:
return err;
}
- ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
- &mk->mk_ino_hash_key);
+ /*
+ * New inodes may not have an inode number assigned yet.
+ * Hashing their inode number is delayed until later.
+ */
+ if (ci->ci_inode->i_ino == 0)
+ WARN_ON(!(ci->ci_inode->i_state & I_CREATING));
+ else
+ fscrypt_hash_inode_number(ci, mk);
return 0;
}
static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
- struct fscrypt_master_key *mk)
+ struct fscrypt_master_key *mk,
+ bool need_dirhash_key)
{
int err;
@@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) {
+ if (need_dirhash_key) {
err = fscrypt_derive_dirhash_key(ci, mk);
if (err)
return err;
@@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
+ bool need_dirhash_key,
struct key **master_key_ret)
{
struct key *key;
@@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
break;
case FSCRYPT_POLICY_V2:
- err = fscrypt_setup_v2_file_key(ci, mk);
+ err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
break;
default:
WARN_ON(1);
@@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci)
kmem_cache_free(fscrypt_info_cachep, ci);
}
-int fscrypt_get_encryption_info(struct inode *inode)
+static int
+fscrypt_setup_encryption_info(struct inode *inode,
+ const union fscrypt_policy *policy,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
+ bool need_dirhash_key)
{
struct fscrypt_info *crypt_info;
- union fscrypt_context ctx;
struct fscrypt_mode *mode;
struct key *master_key = NULL;
int res;
- if (fscrypt_has_encryption_key(inode))
- return 0;
-
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res < 0) {
- const union fscrypt_context *dummy_ctx =
- fscrypt_get_dummy_context(inode->i_sb);
-
- if (IS_ENCRYPTED(inode) || !dummy_ctx) {
- fscrypt_warn(inode,
- "Error %d getting encryption context",
- res);
- return res;
- }
- /* Fake up a context for an unencrypted directory */
- res = fscrypt_context_size(dummy_ctx);
- memcpy(&ctx, dummy_ctx, res);
- }
-
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
crypt_info->ci_inode = inode;
-
- res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res);
- if (res) {
- fscrypt_warn(inode,
- "Unrecognized or corrupt encryption context");
- goto out;
- }
-
- memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
- FSCRYPT_FILE_NONCE_SIZE);
-
- if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
- res = -EINVAL;
- goto out;
- }
+ crypt_info->ci_policy = *policy;
+ memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
mode = select_encryption_mode(&crypt_info->ci_policy, inode);
if (IS_ERR(mode)) {
@@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode)
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key,
+ &master_key);
if (res)
goto out;
/*
- * Multiple tasks may race to set ->i_crypt_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * For existing inodes, multiple tasks may race to set ->i_crypt_info.
+ * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
* fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
* RELEASE barrier so that other tasks can ACQUIRE it.
*/
@@ -550,14 +541,113 @@ out:
up_read(&mk->mk_secret_sem);
key_put(master_key);
}
+ put_crypt_info(crypt_info);
+ return res;
+}
+
+/**
+ * fscrypt_get_encryption_info() - set up an inode's encryption key
+ * @inode: the inode to set up the key for. Must be encrypted.
+ *
+ * Set up ->i_crypt_info, if it hasn't already been done.
+ *
+ * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * generally this shouldn't be called from within a filesystem transaction.
+ *
+ * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
+ * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * distinguish these cases.) Also can return another -errno code.
+ */
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ int res;
+ union fscrypt_context ctx;
+ union fscrypt_policy policy;
+
+ if (fscrypt_has_encryption_key(inode))
+ return 0;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ fscrypt_warn(inode, "Error %d getting encryption context", res);
+ return res;
+ }
+
+ res = fscrypt_policy_from_context(&policy, &ctx, res);
+ if (res) {
+ fscrypt_warn(inode,
+ "Unrecognized or corrupt encryption context");
+ return res;
+ }
+
+ if (!fscrypt_supported_policy(&policy, inode))
+ return -EINVAL;
+
+ res = fscrypt_setup_encryption_info(inode, &policy,
+ fscrypt_context_nonce(&ctx),
+ IS_CASEFOLDED(inode) &&
+ S_ISDIR(inode->i_mode));
if (res == -ENOKEY)
res = 0;
- put_crypt_info(crypt_info);
return res;
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
/**
+ * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
+ * @dir: a possibly-encrypted directory
+ * @inode: the new inode. ->i_mode must be set already.
+ * ->i_ino doesn't need to be set yet.
+ * @encrypt_ret: (output) set to %true if the new inode will be encrypted
+ *
+ * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * encrypting the name of the new file. Also, if the new inode will be
+ * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ *
+ * This isn't %GFP_NOFS-safe, and therefore it should be called before starting
+ * any filesystem transaction to create the inode. For this reason, ->i_ino
+ * isn't required to be set yet, as the filesystem may not have set it yet.
+ *
+ * This doesn't persist the new inode's encryption context. That still needs to
+ * be done later by calling fscrypt_set_context().
+ *
+ * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
+ * -errno code
+ */
+int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
+ bool *encrypt_ret)
+{
+ const union fscrypt_policy *policy;
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL)
+ return 0;
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
+
+ if (WARN_ON_ONCE(inode->i_mode == 0))
+ return -EINVAL;
+
+ /*
+ * Only regular files, directories, and symlinks are encrypted.
+ * Special files like device nodes and named pipes aren't.
+ */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ return 0;
+
+ *encrypt_ret = true;
+
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ return fscrypt_setup_encryption_info(inode, policy, nonce,
+ IS_CASEFOLDED(dir) &&
+ S_ISDIR(inode->i_mode));
+}
+EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
+
+/**
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index a3cb52572b05..2762c5350432 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key,
goto out;
}
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
if (!req) {
res = -ENOMEM;
goto out;
@@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix,
const struct user_key_payload *ukp;
const struct fscrypt_key *payload;
- description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ description = kasprintf(GFP_KERNEL, "%s%*phN", prefix,
FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
return ERR_PTR(-ENOMEM);
@@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
return dk;
/* Nope, allocate one. */
- dk = kzalloc(sizeof(*dk), GFP_NOFS);
+ dk = kzalloc(sizeof(*dk), GFP_KERNEL);
if (!dk)
return ERR_PTR(-ENOMEM);
refcount_set(&dk->dk_refcount, 1);
@@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
* This cannot be a stack buffer because it will be passed to the
* scatterlist crypto API during derive_key_aes().
*/
- derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+ derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL);
if (!derived_key)
return -ENOMEM;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 2d73fd39ad96..4441d9944b9e 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+static const union fscrypt_policy *
+fscrypt_get_dummy_policy(struct super_block *sb)
+{
+ if (!sb->s_cop->get_dummy_policy)
+ return NULL;
+ return sb->s_cop->get_dummy_policy(sb);
+}
+
static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
32, 32))
return false;
+ /*
+ * IV_INO_LBLK_32 hashes the inode number, so in principle it can
+ * support any ino_bits. However, currently the inode number is gotten
+ * from inode::i_ino which is 'unsigned long'. So for now the
+ * implementation limit is 32 bits.
+ */
if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- /* This uses hashed inode numbers, so ino_bits doesn't matter. */
!supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- INT_MAX, 32))
+ 32, 32))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
}
/**
- * fscrypt_new_context_from_policy() - create a new fscrypt_context from
- * an fscrypt_policy
+ * fscrypt_new_context() - create a new fscrypt_context
* @ctx_u: output context
* @policy_u: input policy
+ * @nonce: nonce to use
*
* Create an fscrypt_context for an inode that is being assigned the given
- * encryption policy. A new nonce is randomly generated.
+ * encryption policy. @nonce must be a new random nonce.
*
* Return: the size of the new context in bytes.
*/
-static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
- const union fscrypt_policy *policy_u)
+static int fscrypt_new_context(union fscrypt_context *ctx_u,
+ const union fscrypt_policy *policy_u,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE])
{
memset(ctx_u, 0, sizeof(*ctx_u));
@@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_descriptor,
policy->master_key_descriptor,
sizeof(ctx->master_key_descriptor));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
case FSCRYPT_POLICY_V2: {
@@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
}
@@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
static int set_encryption_policy(struct inode *inode,
const union fscrypt_policy *policy)
{
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
union fscrypt_context ctx;
int ctxsize;
int err;
@@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode,
return -EINVAL;
}
- ctxsize = fscrypt_new_context_from_policy(&ctx, policy);
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ ctxsize = fscrypt_new_context(&ctx, policy, nonce);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}
@@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
+/*
+ * Return the encryption policy that new files in the directory will inherit, or
+ * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also
+ * ensure that its key is set up, so that the new filename can be encrypted.
+ */
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
+{
+ int err;
+
+ if (IS_ENCRYPTED(dir)) {
+ err = fscrypt_require_key(dir);
+ if (err)
+ return ERR_PTR(err);
+ return &dir->i_crypt_info->ci_policy;
+ }
+
+ return fscrypt_get_dummy_policy(dir->i_sb);
+}
+
/**
- * fscrypt_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- * @fs_data: private data given by FS.
- * @preload: preload child i_crypt_info if true
+ * fscrypt_set_context() - Set the fscrypt context of a new inode
+ * @inode: a new inode
+ * @fs_data: private data given by FS and passed to ->set_context()
+ *
+ * This should be called after fscrypt_prepare_new_inode(), generally during a
+ * filesystem transaction. Everything here must be %GFP_NOFS-safe.
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_inherit_context(struct inode *parent, struct inode *child,
- void *fs_data, bool preload)
+int fscrypt_set_context(struct inode *inode, void *fs_data)
{
+ struct fscrypt_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
- struct fscrypt_info *ci;
- int res;
-
- res = fscrypt_get_encryption_info(parent);
- if (res < 0)
- return res;
- ci = fscrypt_get_info(parent);
- if (ci == NULL)
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
return -ENOKEY;
- ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy);
-
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data);
- if (res)
- return res;
- return preload ? fscrypt_get_encryption_info(child): 0;
+ ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+
+ /*
+ * This may be the first time the inode number is available, so do any
+ * delayed key setup that requires the inode number.
+ */
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ const struct fscrypt_master_key *mk =
+ ci->ci_master_key->payload.data[0];
+
+ fscrypt_hash_inode_number(ci, mk);
+ }
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
-EXPORT_SYMBOL(fscrypt_inherit_context);
+EXPORT_SYMBOL_GPL(fscrypt_set_context);
/**
* fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
* @sb: the filesystem on which test_dummy_encryption is being specified
- * @arg: the argument to the test_dummy_encryption option.
- * If no argument was specified, then @arg->from == NULL.
- * @dummy_ctx: the filesystem's current dummy context (input/output, see below)
+ * @arg: the argument to the test_dummy_encryption option. May be NULL.
+ * @dummy_policy: the filesystem's current dummy policy (input/output, see
+ * below)
*
* Handle the test_dummy_encryption mount option by creating a dummy encryption
- * context, saving it in @dummy_ctx, and adding the corresponding dummy
- * encryption key to the filesystem. If the @dummy_ctx is already set, then
+ * policy, saving it in @dummy_policy, and adding the corresponding dummy
+ * encryption key to the filesystem. If the @dummy_policy is already set, then
* instead validate that it matches @arg. Don't support changing it via
* remount, as that is difficult to do safely.
*
- * The reason we use an fscrypt_context rather than an fscrypt_policy is because
- * we mustn't generate a new nonce each time we access a dummy-encrypted
- * directory, as that would change the way filenames are encrypted.
- *
- * Return: 0 on success (dummy context set, or the same context is already set);
- * -EEXIST if a different dummy context is already set;
+ * Return: 0 on success (dummy policy set, or the same policy is already set);
+ * -EEXIST if a different dummy policy is already set;
* or another -errno value.
*/
-int fscrypt_set_test_dummy_encryption(struct super_block *sb,
- const substring_t *arg,
- struct fscrypt_dummy_context *dummy_ctx)
+int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
+ struct fscrypt_dummy_policy *dummy_policy)
{
- const char *argstr = "v2";
- const char *argstr_to_free = NULL;
struct fscrypt_key_specifier key_spec = { 0 };
int version;
- union fscrypt_context *ctx = NULL;
+ union fscrypt_policy *policy = NULL;
int err;
- if (arg->from) {
- argstr = argstr_to_free = match_strdup(arg);
- if (!argstr)
- return -ENOMEM;
- }
+ if (!arg)
+ arg = "v2";
- if (!strcmp(argstr, "v1")) {
- version = FSCRYPT_CONTEXT_V1;
+ if (!strcmp(arg, "v1")) {
+ version = FSCRYPT_POLICY_V1;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
memset(key_spec.u.descriptor, 0x42,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
- } else if (!strcmp(argstr, "v2")) {
- version = FSCRYPT_CONTEXT_V2;
+ } else if (!strcmp(arg, "v2")) {
+ version = FSCRYPT_POLICY_V2;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
/* key_spec.u.identifier gets filled in when adding the key */
} else {
@@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
goto out;
}
- if (dummy_ctx->ctx) {
- /*
- * Note: if we ever make test_dummy_encryption support
- * specifying other encryption settings, such as the encryption
- * modes, we'll need to compare those settings here.
- */
- if (dummy_ctx->ctx->version == version)
- err = 0;
- else
- err = -EEXIST;
- goto out;
- }
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
+ policy = kzalloc(sizeof(*policy), GFP_KERNEL);
+ if (!policy) {
err = -ENOMEM;
goto out;
}
@@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
if (err)
goto out;
- ctx->version = version;
- switch (ctx->version) {
- case FSCRYPT_CONTEXT_V1:
- ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor,
+ policy->version = version;
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
break;
- case FSCRYPT_CONTEXT_V2:
- ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier,
+ case FSCRYPT_POLICY_V2:
+ policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v2.master_key_identifier, key_spec.u.identifier,
FSCRYPT_KEY_IDENTIFIER_SIZE);
break;
default:
@@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
err = -EINVAL;
goto out;
}
- dummy_ctx->ctx = ctx;
- ctx = NULL;
+
+ if (dummy_policy->policy) {
+ if (fscrypt_policies_equal(policy, dummy_policy->policy))
+ err = 0;
+ else
+ err = -EEXIST;
+ goto out;
+ }
+ dummy_policy->policy = policy;
+ policy = NULL;
err = 0;
out:
- kfree(ctx);
- kfree(argstr_to_free);
+ kfree(policy);
return err;
}
EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
@@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
struct super_block *sb)
{
- const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb);
+ const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb);
+ int vers;
- if (!ctx)
+ if (!policy)
return;
- seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version);
+
+ vers = policy->version;
+ if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */
+ vers = 1;
+
+ seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers);
}
EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);
diff --git a/fs/d_path.c b/fs/d_path.c
index 0f1fc1743302..a69e2cd36e6e 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -102,6 +102,8 @@ restart:
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
struct mount *parent = READ_ONCE(mnt->mnt_parent);
+ struct mnt_namespace *mnt_ns;
+
/* Escaped? */
if (dentry != vfsmnt->mnt_root) {
bptr = *buffer;
@@ -116,7 +118,9 @@ restart:
vfsmnt = &mnt->mnt;
continue;
}
- if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns))
+ mnt_ns = READ_ONCE(mnt->mnt_ns);
+ /* open-coded is_mounted() to use local mnt_ns */
+ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
error = 1; // absolute root
else
error = 2; // detached or not attached yet
diff --git a/fs/dax.c b/fs/dax.c
index 11b16729b86f..6ad346352a8c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -488,7 +488,7 @@ retry:
if (dax_is_conflict(entry))
goto fallback;
if (!xa_is_value(entry)) {
- xas_set_err(xas, EIO);
+ xas_set_err(xas, -EIO);
goto out_unlock;
}
@@ -680,21 +680,20 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_entry(mapping, index, false);
}
-static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, struct page *to,
- unsigned long vaddr)
+static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, struct page *to, unsigned long vaddr)
{
void *vto, *kaddr;
pgoff_t pgoff;
long rc;
int id;
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
@@ -1038,18 +1037,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
-int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
- struct iomap *iomap)
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
pgoff_t pgoff;
long rc, id;
void *kaddr;
bool page_aligned = false;
-
+ unsigned offset = offset_in_page(pos);
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
- IS_ALIGNED(size, PAGE_SIZE))
+ (size == PAGE_SIZE))
page_aligned = true;
rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
@@ -1059,8 +1058,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
id = dax_read_lock();
if (page_aligned)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff,
- size >> PAGE_SHIFT);
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
@@ -1073,7 +1071,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
dax_flush(iomap->dax_dev, kaddr + offset, size);
}
dax_read_unlock(id);
- return 0;
+ return size;
}
static loff_t
@@ -1305,8 +1303,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
clear_user_highpage(vmf->cow_page, vaddr);
break;
case IOMAP_MAPPED:
- error = copy_user_dax(iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, vmf->cow_page, vaddr);
+ error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
+ sector, vmf->cow_page, vaddr);
break;
default:
WARN_ON_ONCE(1);
@@ -1368,7 +1366,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret = dax_load_hole(&xas, mapping, &entry, vmf);
goto finish_iomap;
}
- /*FALLTHRU*/
+ fallthrough;
default:
WARN_ON_ONCE(1);
error = -EIO;
diff --git a/fs/dcache.c b/fs/dcache.c
index 361ea7ab30ea..ea0485861d93 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1746,7 +1746,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_lockref.count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
- seqcount_init(&dentry->d_seq);
+ seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index b167d2d02148..a768a09430c3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -177,7 +177,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
goto out;
if (!fops_get(real_fops)) {
-#ifdef MODULE
+#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING)
goto out;
@@ -312,7 +312,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
goto out;
if (!fops_get(real_fops)) {
-#ifdef MODULE
+#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING)
goto out;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 183299892465..abf535b036ab 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
-/**
- * dio_end_io - handle the end io action for the given bio
- * @bio: The direct io bio thats being completed
- *
- * This is meant to be called by any filesystem that uses their own dio_submit_t
- * so that the DIO specific endio actions are dealt with after the filesystem
- * has done it's completion work.
- */
-void dio_end_io(struct bio *bio)
-{
- struct dio *dio = bio->bi_private;
-
- if (dio->is_async)
- dio_bio_end_aio(bio);
- else
- dio_bio_end_io(bio);
-}
-EXPORT_SYMBOL_GPL(dio_end_io);
-
static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..ee92634196a8 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -4,6 +4,7 @@ menuconfig DLM
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
+ select SRCU
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 47f0b98b707f..49c5f9407098 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item,
CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
- int *info_field, int check_zero,
+ int *info_field, bool (*check_cb)(unsigned int x),
const char *buf, size_t len)
{
unsigned int x;
@@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
if (rc)
return rc;
- if (check_zero && !x)
+ if (check_cb && check_cb(x))
return -EINVAL;
*cl_field = x;
@@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
return len;
}
-#define CLUSTER_ATTR(name, check_zero) \
+#define CLUSTER_ATTR(name, check_cb) \
static ssize_t cluster_##name##_store(struct config_item *item, \
const char *buf, size_t len) \
{ \
struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
- check_zero, buf, len); \
+ check_cb, buf, len); \
} \
static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
@@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
} \
CONFIGFS_ATTR(cluster_, name);
-CLUSTER_ATTR(tcp_port, 1);
-CLUSTER_ATTR(buffer_size, 1);
-CLUSTER_ATTR(rsbtbl_size, 1);
-CLUSTER_ATTR(recover_timer, 1);
-CLUSTER_ATTR(toss_secs, 1);
-CLUSTER_ATTR(scan_secs, 1);
-CLUSTER_ATTR(log_debug, 0);
-CLUSTER_ATTR(log_info, 0);
-CLUSTER_ATTR(protocol, 0);
-CLUSTER_ATTR(mark, 0);
-CLUSTER_ATTR(timewarn_cs, 1);
-CLUSTER_ATTR(waitwarn_us, 0);
-CLUSTER_ATTR(new_rsb_count, 0);
-CLUSTER_ATTR(recover_callbacks, 0);
+static bool dlm_check_zero(unsigned int x)
+{
+ return !x;
+}
+
+static bool dlm_check_buffer_size(unsigned int x)
+{
+ return (x < DEFAULT_BUFFER_SIZE);
+}
+
+CLUSTER_ATTR(tcp_port, dlm_check_zero);
+CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
+CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
+CLUSTER_ATTR(recover_timer, dlm_check_zero);
+CLUSTER_ATTR(toss_secs, dlm_check_zero);
+CLUSTER_ATTR(scan_secs, dlm_check_zero);
+CLUSTER_ATTR(log_debug, NULL);
+CLUSTER_ATTR(log_info, NULL);
+CLUSTER_ATTR(protocol, NULL);
+CLUSTER_ATTR(mark, NULL);
+CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
+CLUSTER_ATTR(waitwarn_us, NULL);
+CLUSTER_ATTR(new_rsb_count, NULL);
+CLUSTER_ATTR(recover_callbacks, NULL);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
@@ -221,6 +231,7 @@ struct dlm_space {
struct list_head members;
struct mutex members_lock;
int members_count;
+ struct dlm_nodes *nds;
};
struct dlm_comms {
@@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
sp->members_count = 0;
+ sp->nds = nds;
return &sp->group;
fail:
@@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
static void release_space(struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
+ kfree(sp->nds);
kfree(sp);
}
@@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq)
return 0;
}
-int dlm_comm_mark(int nodeid, unsigned int *mark)
+void dlm_comm_mark(int nodeid, unsigned int *mark)
{
struct dlm_comm *cm;
cm = get_comm(nodeid);
- if (!cm)
- return -ENOENT;
+ if (!cm) {
+ *mark = dlm_config.ci_mark;
+ return;
+ }
- *mark = cm->mark;
- put_comm(cm);
+ if (cm->mark)
+ *mark = cm->mark;
+ else
+ *mark = dlm_config.ci_mark;
- return 0;
+ put_comm(cm);
}
int dlm_our_nodeid(void)
@@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
/* Config file defaults */
#define DEFAULT_TCP_PORT 21064
-#define DEFAULT_BUFFER_SIZE 4096
#define DEFAULT_RSBTBL_SIZE 1024
#define DEFAULT_RECOVER_TIMER 5
#define DEFAULT_TOSS_SECS 10
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f62996cad561..c210250a2581 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -12,6 +12,8 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
+#define DEFAULT_BUFFER_SIZE 4096
+
struct dlm_config_node {
int nodeid;
int weight;
@@ -46,7 +48,7 @@ void dlm_config_exit(void);
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out);
int dlm_comm_seq(int nodeid, uint32_t *seq);
-int dlm_comm_mark(int nodeid, unsigned int *mark);
+void dlm_comm_mark(int nodeid, unsigned int *mark);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 18d81599522f..002123efc6b0 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5817,7 +5817,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
break;
case -EAGAIN:
error = 0;
- /* fall through */
+ fallthrough;
default:
__put_lkb(ls, lkb);
goto out;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5050fe05769b..79f56f16bc2c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -65,40 +65,6 @@
#define MAX_SEND_MSG_COUNT 25
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-struct cbuf {
- unsigned int base;
- unsigned int len;
- unsigned int mask;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
- cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
- return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
- cb->base = cb->len = 0;
- cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
- cb->len -= n;
- cb->base += n;
- cb->base &= cb->mask;
-}
-
-static bool cbuf_empty(struct cbuf *cb)
-{
- return cb->len == 0;
-}
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
@@ -117,8 +83,6 @@ struct connection {
int (*rx_action) (struct connection *); /* What to do when active */
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
- struct page *rx_page;
- struct cbuf cb;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -126,6 +90,10 @@ struct connection {
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
+ unsigned char *rx_buf;
+ int rx_buflen;
+ int rx_leftover;
+ struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
-static DEFINE_MUTEX(connections_lock);
-static struct kmem_cache *con_cache;
+static DEFINE_SPINLOCK(connections_lock);
+DEFINE_STATIC_SRCU(connections_srcu);
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
@@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid)
static struct connection *__find_con(int nodeid)
{
- int r;
+ int r, idx;
struct connection *con;
r = nodeid_hash(nodeid);
- hlist_for_each_entry(con, &connection_hash[r], list) {
- if (con->nodeid == nodeid)
+ idx = srcu_read_lock(&connections_srcu);
+ hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
+ if (con->nodeid == nodeid) {
+ srcu_read_unlock(&connections_srcu, idx);
return con;
+ }
}
+ srcu_read_unlock(&connections_srcu, idx);
+
return NULL;
}
@@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid)
* If 'allocation' is zero then we don't attempt to create a new
* connection structure for this node.
*/
-static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
+static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
- struct connection *con = NULL;
+ struct connection *con, *tmp;
int r;
con = __find_con(nodeid);
if (con || !alloc)
return con;
- con = kmem_cache_zalloc(con_cache, alloc);
+ con = kzalloc(sizeof(*con), alloc);
if (!con)
return NULL;
- r = nodeid_hash(nodeid);
- hlist_add_head(&con->list, &connection_hash[r]);
+ con->rx_buflen = dlm_config.ci_buffer_size;
+ con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
+ if (!con->rx_buf) {
+ kfree(con);
+ return NULL;
+ }
con->nodeid = nodeid;
mutex_init(&con->sock_mutex);
@@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
con->rx_action = zerocon->rx_action;
}
+ r = nodeid_hash(nodeid);
+
+ spin_lock(&connections_lock);
+ /* Because multiple workqueues/threads calls this function it can
+ * race on multiple cpu's. Instead of locking hot path __find_con()
+ * we just check in rare cases of recently added nodes again
+ * under protection of connections_lock. If this is the case we
+ * abort our connection creation and return the existing connection.
+ */
+ tmp = __find_con(nodeid);
+ if (tmp) {
+ spin_unlock(&connections_lock);
+ kfree(con->rx_buf);
+ kfree(con);
+ return tmp;
+ }
+
+ hlist_add_head_rcu(&con->list, &connection_hash[r]);
+ spin_unlock(&connections_lock);
+
return con;
}
/* Loop round all connections */
static void foreach_conn(void (*conn_func)(struct connection *c))
{
- int i;
- struct hlist_node *n;
+ int i, idx;
struct connection *con;
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list)
conn_func(con);
}
-}
-
-static struct connection *nodeid2con(int nodeid, gfp_t allocation)
-{
- struct connection *con;
-
- mutex_lock(&connections_lock);
- con = __nodeid2con(nodeid, allocation);
- mutex_unlock(&connections_lock);
-
- return con;
+ srcu_read_unlock(&connections_srcu, idx);
}
static struct dlm_node_addr *find_node_addr(int nodeid)
@@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other,
/* Will only re-enter once. */
close_connection(con->othercon, false, true, true);
}
- if (con->rx_page) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
- }
+ con->rx_leftover = 0;
con->retries = 0;
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
@@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con)
shutdown_connection(con);
}
+static int con_realloc_receive_buf(struct connection *con, int newlen)
+{
+ unsigned char *newbuf;
+
+ newbuf = kmalloc(newlen, GFP_NOFS);
+ if (!newbuf)
+ return -ENOMEM;
+
+ /* copy any leftover from last receive */
+ if (con->rx_leftover)
+ memmove(newbuf, con->rx_buf, con->rx_leftover);
+
+ /* swap to new buffer space */
+ kfree(con->rx_buf);
+ con->rx_buflen = newlen;
+ con->rx_buf = newbuf;
+
+ return 0;
+}
+
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
- int ret = 0;
- struct msghdr msg = {};
- struct kvec iov[2];
- unsigned len;
- int r;
int call_again_soon = 0;
- int nvec;
+ struct msghdr msg;
+ struct kvec iov;
+ int ret, buflen;
mutex_lock(&con->sock_mutex);
@@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con)
ret = -EAGAIN;
goto out_close;
}
+
if (con->nodeid == 0) {
ret = -EINVAL;
goto out_close;
}
- if (con->rx_page == NULL) {
- /*
- * This doesn't need to be atomic, but I think it should
- * improve performance if it is.
- */
- con->rx_page = alloc_page(GFP_ATOMIC);
- if (con->rx_page == NULL)
+ /* realloc if we get new buffer size to read out */
+ buflen = dlm_config.ci_buffer_size;
+ if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
+ ret = con_realloc_receive_buf(con, buflen);
+ if (ret < 0)
goto out_resched;
- cbuf_init(&con->cb, PAGE_SIZE);
}
- /*
- * iov[0] is the bit of the circular buffer between the current end
- * point (cb.base + cb.len) and the end of the buffer.
- */
- iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
- iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
- iov[1].iov_len = 0;
- nvec = 1;
-
- /*
- * iov[1] is the bit of the circular buffer between the start of the
- * buffer and the start of the currently used section (cb.base)
+ /* calculate new buffer parameter regarding last receive and
+ * possible leftover bytes
*/
- if (cbuf_data(&con->cb) >= con->cb.base) {
- iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
- iov[1].iov_len = con->cb.base;
- iov[1].iov_base = page_address(con->rx_page);
- nvec = 2;
- }
- len = iov[0].iov_len + iov[1].iov_len;
- iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len);
+ iov.iov_base = con->rx_buf + con->rx_leftover;
+ iov.iov_len = con->rx_buflen - con->rx_leftover;
- r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL);
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
if (ret <= 0)
goto out_close;
- else if (ret == len)
+ else if (ret == iov.iov_len)
call_again_soon = 1;
- cbuf_add(&con->cb, ret);
- ret = dlm_process_incoming_buffer(con->nodeid,
- page_address(con->rx_page),
- con->cb.base, con->cb.len,
- PAGE_SIZE);
- if (ret < 0) {
- log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d",
- ret, page_address(con->rx_page), con->cb.base,
- con->cb.len, r);
- cbuf_eat(&con->cb, r);
- } else {
- cbuf_eat(&con->cb, ret);
- }
+ /* new buflen according readed bytes and leftover from last receive */
+ buflen = ret + con->rx_leftover;
+ ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
+ if (ret < 0)
+ goto out_close;
- if (cbuf_empty(&con->cb) && !call_again_soon) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
+ /* calculate leftover bytes from process and put it into begin of
+ * the receive buffer, so next receive we have the full message
+ * at the start address of the receive buffer.
+ */
+ con->rx_leftover = buflen - ret;
+ if (con->rx_leftover) {
+ memmove(con->rx_buf, con->rx_buf + ret,
+ con->rx_leftover);
+ call_again_soon = true;
}
if (call_again_soon)
goto out_resched;
+
mutex_unlock(&con->sock_mutex);
return 0;
@@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con)
int nodeid;
struct connection *newcon;
struct connection *addcon;
+ unsigned int mark;
- mutex_lock(&connections_lock);
if (!dlm_allow_conn) {
- mutex_unlock(&connections_lock);
return -1;
}
- mutex_unlock(&connections_lock);
mutex_lock_nested(&con->sock_mutex, 0);
@@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con)
return -1;
}
+ dlm_comm_mark(nodeid, &mark);
+ sock_set_mark(newsock->sk, mark);
+
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This
@@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con)
struct connection *othercon = newcon->othercon;
if (!othercon) {
- othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
+ othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
mutex_unlock(&newcon->sock_mutex);
result = -ENOMEM;
goto accept_err;
}
+
+ othercon->rx_buflen = dlm_config.ci_buffer_size;
+ othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS);
+ if (!othercon->rx_buf) {
+ mutex_unlock(&newcon->sock_mutex);
+ kfree(othercon);
+ log_print("failed to allocate incoming socket receive buffer");
+ result = -ENOMEM;
+ goto accept_err;
+ }
+
othercon->nodeid = nodeid;
othercon->rx_action = receive_from_sock;
mutex_init(&othercon->sock_mutex);
@@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
/* Some odd races can cause double-connects, ignore them */
@@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con)
if (result < 0)
goto socket_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto bind_err;
-
sock_set_mark(sock->sk, mark);
con->rx_action = receive_from_sock;
@@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
if (con->retries++ > MAX_CONNECT_RETRIES)
goto out;
@@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con)
if (result < 0)
goto out_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto out_err;
-
sock_set_mark(sock->sk, mark);
memset(&saddr, 0, sizeof(saddr));
@@ -1238,6 +1229,14 @@ static void init_local(void)
}
}
+static void deinit_local(void)
+{
+ int i;
+
+ for (i = 0; i < dlm_local_count; i++)
+ kfree(dlm_local_addr[i]);
+}
+
/* Initialise SCTP socket and bind to all interfaces */
static int sctp_listen_for_all(void)
{
@@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work)
send_to_sock(con);
}
-
-/* Discard all entries on the write queues */
-static void clean_writequeues(void)
-{
- foreach_conn(clean_one_writequeue);
-}
-
static void work_stop(void)
{
if (recv_workqueue)
@@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con)
con->shutdown_action(con);
}
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ kfree(con->rx_buf);
+ kfree(con);
+}
+
static void free_conn(struct connection *con)
{
close_connection(con, true, true, true);
- if (con->othercon)
- kmem_cache_free(con_cache, con->othercon);
- hlist_del(&con->list);
- kmem_cache_free(con_cache, con);
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
+ call_rcu(&con->othercon->rcu, connection_release);
+ }
+ clean_one_writequeue(con);
+ call_rcu(&con->rcu, connection_release);
}
static void work_flush(void)
{
- int ok;
+ int ok, idx;
int i;
- struct hlist_node *n;
struct connection *con;
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
do {
ok = 1;
foreach_conn(stop_conn);
@@ -1635,9 +1635,10 @@ static void work_flush(void)
flush_workqueue(recv_workqueue);
if (send_workqueue)
flush_workqueue(send_workqueue);
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_safe(con, n,
- &connection_hash[i], list) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i],
+ list) {
ok &= test_bit(CF_READ_PENDING, &con->flags);
ok &= test_bit(CF_WRITE_PENDING, &con->flags);
if (con->othercon) {
@@ -1648,6 +1649,7 @@ static void work_flush(void)
}
}
}
+ srcu_read_unlock(&connections_srcu, idx);
} while (!ok);
}
@@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void)
/* Set all the flags to prevent any
socket activity.
*/
- mutex_lock(&connections_lock);
dlm_allow_conn = 0;
- mutex_unlock(&connections_lock);
+
+ if (recv_workqueue)
+ flush_workqueue(recv_workqueue);
+ if (send_workqueue)
+ flush_workqueue(send_workqueue);
+
foreach_conn(shutdown_conn);
work_flush();
- clean_writequeues();
foreach_conn(free_conn);
work_stop();
-
- kmem_cache_destroy(con_cache);
+ deinit_local();
}
int dlm_lowcomms_start(void)
@@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- error = -ENOMEM;
- con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
- __alignof__(struct connection), 0,
- NULL);
- if (!con_cache)
- goto fail;
-
error = work_start();
if (error)
- goto fail_destroy;
+ goto fail;
dlm_allow_conn = 1;
@@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
con = nodeid2con(0,0);
- if (con) {
- close_connection(con, false, true, true);
- kmem_cache_free(con_cache, con);
- }
-fail_destroy:
- kmem_cache_destroy(con_cache);
+ if (con)
+ free_conn(con);
fail:
return error;
}
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 921322d133e3..fde3a6afe4be 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -22,114 +22,84 @@
* into packets and sends them to the comms layer.
*/
+#include <asm/unaligned.h>
+
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "lock.h"
#include "midcomms.h"
-
-static void copy_from_cb(void *dst, const void *base, unsigned offset,
- unsigned len, unsigned limit)
-{
- unsigned copy = len;
-
- if ((copy + offset) > limit)
- copy = limit - offset;
- memcpy(dst, base + offset, copy);
- len -= copy;
- if (len)
- memcpy(dst + copy, base, len);
-}
-
/*
* Called from the low-level comms layer to process a buffer of
* commands.
- *
- * Only complete messages are processed here, any "spare" bytes from
- * the end of a buffer are saved and tacked onto the front of the next
- * message that comes in. I doubt this will happen very often but we
- * need to be able to cope with it and I don't want the task to be waiting
- * for packets to come in when there is useful work to be done.
*/
-int dlm_process_incoming_buffer(int nodeid, const void *base,
- unsigned offset, unsigned len, unsigned limit)
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
- union {
- unsigned char __buf[DLM_INBUF_LEN];
- /* this is to force proper alignment on some arches */
- union dlm_packet p;
- } __tmp;
- union dlm_packet *p = &__tmp.p;
- int ret = 0;
- int err = 0;
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
uint16_t msglen;
- uint32_t lockspace;
-
- while (len > sizeof(struct dlm_header)) {
-
- /* Copy just the header to check the total length. The
- message may wrap around the end of the buffer back to the
- start, so we need to use a temp buffer and copy_from_cb. */
-
- copy_from_cb(p, base, offset, sizeof(struct dlm_header),
- limit);
-
- msglen = le16_to_cpu(p->header.h_length);
- lockspace = p->header.h_lockspace;
+ int ret = 0;
- err = -EINVAL;
- if (msglen < sizeof(struct dlm_header))
- break;
- if (p->header.h_cmd == DLM_MSG) {
- if (msglen < sizeof(struct dlm_message))
- break;
- } else {
- if (msglen < sizeof(struct dlm_rcom))
- break;
- }
- err = -E2BIG;
- if (msglen > dlm_config.ci_buffer_size) {
- log_print("message size %d from %d too big, buf len %d",
- msglen, nodeid, len);
- break;
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ /* no message should be more than this otherwise we
+ * cannot deliver this message to upper layers
+ */
+ msglen = get_unaligned_le16(&hd->h_length);
+ if (msglen > DEFAULT_BUFFER_SIZE) {
+ log_print("received invalid length header: %u, will abort message parsing",
+ msglen);
+ return -EBADMSG;
}
- err = 0;
-
- /* If only part of the full message is contained in this
- buffer, then do nothing and wait for lowcomms to call
- us again later with more data. We return 0 meaning
- we've consumed none of the input buffer. */
+ /* caller will take care that leftover
+ * will be parsed next call with more data
+ */
if (msglen > len)
break;
- /* Allocate a larger temp buffer if the full message won't fit
- in the buffer on the stack (which should work for most
- ordinary messages). */
-
- if (msglen > sizeof(__tmp) && p == &__tmp.p) {
- p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
- if (p == NULL)
- return ret;
- }
+ switch (hd->h_cmd) {
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("dlm msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- copy_from_cb(p, base, offset, msglen, limit);
+ break;
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("dlm rcom msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- BUG_ON(lockspace != p->header.h_lockspace);
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message",
+ hd->h_cmd);
+ goto skip;
+ }
+ /* for aligned memory access, we just copy current message
+ * to begin of the buffer which contains already parsed buffer
+ * data and should provide align access for upper layers
+ * because the start address of the buffer has a aligned
+ * address. This memmove can be removed when the upperlayer
+ * is capable of unaligned memory access.
+ */
+ memmove(buf, ptr, msglen);
+ dlm_receive_buffer((union dlm_packet *)buf, nodeid);
+
+skip:
ret += msglen;
- offset += msglen;
- offset &= (limit - 1);
len -= msglen;
-
- dlm_receive_buffer(p, nodeid);
+ ptr += msglen;
}
- if (p != &__tmp.p)
- kfree(p);
-
- return err ? err : ret;
+ return ret;
}
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 2e122e81c8d0..61e90a921849 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -12,8 +12,7 @@
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
-int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
- unsigned len, unsigned limit);
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 28bb5689333a..15880a68faad 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+ /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */
+ strreplace(name, '/', '!');
+
inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
is_removable);
if (!inode)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 459ecb42cbd3..347be146884c 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -224,7 +224,7 @@ submit_bio_retry:
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)blknr <<
LOG_SECTORS_PER_BLOCK;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0);
}
err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ddaa516c008a..b9a09806512a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
enum {
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_cache_strategy,
Opt_err
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index c8c381eadcd6..5bde77d70852 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 6c939def00f9..50912a5420b4 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend {
struct z_erofs_collector clt;
struct erofs_map_blocks map;
+ bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
@@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ enum z_erofs_cache_alloctype type)
{
const struct z_erofs_pcluster *pcl = clt->pcl;
const unsigned int clusterpages = BIT(pcl->clusterbits);
@@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page,
- struct list_head *pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -620,8 +619,7 @@ restart_now:
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy);
hitted:
/*
@@ -653,7 +651,7 @@ retry:
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
- erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
+ alloc_page(GFP_NOFS | __GFP_NOFAIL);
newpage->mapping = Z_EROFS_MAPPING_STAGING;
err = z_erofs_attach_page(clt, newpage,
@@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
}
static void z_erofs_submit_queue(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
@@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
+ z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
/* since bio will be NULL, no need to initialize last_index */
pgoff_t last_index;
unsigned int nr_bios = 0;
@@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb,
do {
struct page *page;
- int err;
page = pickup_page_for_submission(pcl, i++, pagepool,
MNGD_MAPPING(sbi),
@@ -1216,11 +1214,12 @@ submit_bio_retry:
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
bio->bi_opf = REQ_OP_READ;
+ if (f->readahead)
+ bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (err < PAGE_SIZE)
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
goto submit_bio_retry;
last_index = cur;
@@ -1248,14 +1247,14 @@ submit_bio_retry:
}
static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_collector *clt,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
+ z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page)
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
@@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page)
return err;
}
-static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
- unsigned int nr)
-{
- return nr <= sbi->ctx.max_sync_decompress_pages;
-}
-
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- bool sync = should_decompress_synchronously(sbi, readahead_count(rac));
+ unsigned int nr_pages = readahead_count(rac);
+ bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct page *page, *head = NULL;
LIST_HEAD(pagepool);
- trace_erofs_readpages(inode, readahead_index(rac),
- readahead_count(rac), false);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ f.readahead = true;
f.headoffset = readahead_pos(rac);
while ((page = readahead_page(rac))) {
@@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
@@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
if (f.map.mpage)
put_page(f.map.mpage);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7d40d78ea864..ae325541884e 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -359,7 +359,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
map->m_flags &= ~EROFS_MAP_ZIPPED;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
map->m_la = (lcn << lclusterbits) | m->clusterofs;
break;
@@ -416,7 +416,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
if (endoff >= m.clusterofs)
map->m_flags &= ~EROFS_MAP_ZIPPED;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
if (endoff >= m.clusterofs) {
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
@@ -433,7 +433,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
end = (m.lcn << lclusterbits) | m.clusterofs;
map->m_flags |= EROFS_MAP_FULL_MAPPED;
m.delta[0] = 1;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
/* get the correspoinding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 12eebcdea9c8..4df61129566d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,8 +218,7 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- struct list_head visited_list_link;
- int visited;
+ u64 gen;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
@@ -274,6 +273,8 @@ static long max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);
+static u64 loop_check_gen = 0;
+
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
@@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
-/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
-static LIST_HEAD(visited_list);
-
/*
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
@@ -1450,7 +1448,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi)
{
- const char *name;
+ struct name_snapshot n;
struct wakeup_source *ws;
if (!epi->ep->ws) {
@@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi)
return -ENOMEM;
}
- name = epi->ffd.file->f_path.dentry->d_name.name;
- ws = wakeup_source_register(NULL, name);
+ take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
+ ws = wakeup_source_register(NULL, n.name.name);
+ release_dentry_name_snapshot(&n);
if (!ws)
return -ENOMEM;
@@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hook for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
@@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1595,9 +1596,6 @@ error_remove_epi:
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
@@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
- ep->visited = 1;
- list_add(&ep->visited_list_link, &visited_list);
+ ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->visited)
+ if (ep_tovisit->gen == loop_check_gen)
continue;
error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
@@ -1994,9 +1991,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
* not already there, and calling reverse_path_check()
* during ep_insert().
*/
- if (list_empty(&epi->ffd.file->f_tfile_llink))
- list_add(&epi->ffd.file->f_tfile_llink,
- &tfile_check_list);
+ if (list_empty(&epi->ffd.file->f_tfile_llink)) {
+ if (get_file_rcu(epi->ffd.file))
+ list_add(&epi->ffd.file->f_tfile_llink,
+ &tfile_check_list);
+ }
}
}
mutex_unlock(&ep->mtx);
@@ -2017,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- int ret;
- struct eventpoll *ep_cur, *ep_next;
-
- ret = ep_call_nested(&poll_loop_ncalls,
+ return ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
- /* clear visited list */
- list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
- visited_list_link) {
- ep_cur->visited = 0;
- list_del(&ep_cur->visited_list_link);
- }
- return ret;
}
static void clear_tfile_check_list(void)
@@ -2040,6 +2029,7 @@ static void clear_tfile_check_list(void)
file = list_first_entry(&tfile_check_list, struct file,
f_tfile_llink);
list_del_init(&file->f_tfile_llink);
+ fput(file);
}
INIT_LIST_HEAD(&tfile_check_list);
}
@@ -2192,33 +2182,32 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
+ ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
+ loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
- if (ep_loop_check(ep, tf.file) != 0) {
- clear_tfile_check_list();
+ if (ep_loop_check(ep, tf.file) != 0)
goto error_tgt_fput;
- }
- } else
+ } else {
+ get_file(tf.file);
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
+ }
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- if (error) {
-out_del:
- list_del(&tf.file->f_tfile_llink);
+ if (error)
goto error_tgt_fput;
- }
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
if (error) {
mutex_unlock(&ep->mtx);
- goto out_del;
+ goto error_tgt_fput;
}
}
}
@@ -2239,8 +2228,6 @@ out_del:
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
- if (full_check)
- clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
@@ -2263,8 +2250,11 @@ out_del:
mutex_unlock(&ep->mtx);
error_tgt_fput:
- if (full_check)
+ if (full_check) {
+ clear_tfile_check_list();
+ loop_check_gen++;
mutex_unlock(&epmutex);
+ }
fdput(tf);
error_fput:
diff --git a/fs/exec.c b/fs/exec.c
index 3698252719a3..07910f5032e7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -62,6 +62,7 @@
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
+#include <linux/io_uring.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -141,12 +142,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
if (IS_ERR(file))
goto out;
- error = -EINVAL;
- if (!S_ISREG(file_inode(file)->i_mode))
- goto exit;
-
+ /*
+ * may_open() has already checked for this, so it should be
+ * impossible to trip now. But we need to be extra cautious
+ * and check again at the very end too.
+ */
error = -EACCES;
- if (path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
+ path_noexec(&file->f_path)))
goto exit;
fsnotify_open(file);
@@ -215,7 +218,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
- ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
+ ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
&page, NULL, NULL);
if (ret <= 0)
return NULL;
@@ -909,11 +912,14 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (IS_ERR(file))
goto out;
+ /*
+ * may_open() has already checked for this, so it should be
+ * impossible to trip now. But we need to be extra cautious
+ * and check again at the very end too.
+ */
err = -EACCES;
- if (!S_ISREG(file_inode(file)->i_mode))
- goto exit;
-
- if (path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
+ path_noexec(&file->f_path)))
goto exit;
err = deny_write_access(file);
@@ -1402,7 +1408,12 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- set_fs(USER_DS);
+ /*
+ * Ensure that the uaccess routines can actually operate on userspace
+ * pointers:
+ */
+ force_uaccess_begin();
+
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
@@ -1885,6 +1896,11 @@ static int bprm_execve(struct linux_binprm *bprm,
struct files_struct *displaced;
int retval;
+ /*
+ * Cancel any io_uring activity across execve
+ */
+ io_uring_task_cancel();
+
retval = unshare_files(&displaced);
if (retval)
return retval;
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 4055eb00ea9b..a987919686c0 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -158,7 +158,7 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu)
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
set_bit_le(b, sbi->vol_amap[i]->b_data);
- exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode));
+ exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode));
return 0;
}
@@ -180,7 +180,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu)
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
clear_bit_le(b, sbi->vol_amap[i]->b_data);
- exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode));
+ exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode));
if (opts->discard) {
int ret_discard;
diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c
index 03d0824fc368..5a2f119b7e8c 100644
--- a/fs/exfat/cache.c
+++ b/fs/exfat/cache.c
@@ -17,7 +17,6 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-#define EXFAT_CACHE_VALID 0
#define EXFAT_MAX_CACHE 16
struct exfat_cache {
@@ -61,16 +60,6 @@ void exfat_cache_shutdown(void)
kmem_cache_destroy(exfat_cachep);
}
-void exfat_cache_init_inode(struct inode *inode)
-{
- struct exfat_inode_info *ei = EXFAT_I(inode);
-
- spin_lock_init(&ei->cache_lru_lock);
- ei->nr_caches = 0;
- ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
- INIT_LIST_HEAD(&ei->cache_lru);
-}
-
static inline struct exfat_cache *exfat_cache_alloc(void)
{
return kmem_cache_alloc(exfat_cachep, GFP_NOFS);
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 119abf0d8dd6..573659bfbc55 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -470,7 +470,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
&ep->dentry.file.access_date,
NULL);
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
@@ -480,7 +480,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
exfat_init_stream_entry(ep,
(type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN,
start_clu, size);
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
return 0;
@@ -516,7 +516,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
}
fep->dentry.file.checksum = cpu_to_le16(chksum);
- exfat_update_bh(sb, fbh, IS_DIRSYNC(inode));
+ exfat_update_bh(fbh, IS_DIRSYNC(inode));
release_fbh:
brelse(fbh);
return ret;
@@ -538,7 +538,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
ep->dentry.file.num_ext = (unsigned char)(num_entries - 1);
- exfat_update_bh(sb, bh, sync);
+ exfat_update_bh(bh, sync);
brelse(bh);
ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
@@ -547,7 +547,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
ep->dentry.stream.name_len = p_uniname->name_len;
ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash);
- exfat_update_bh(sb, bh, sync);
+ exfat_update_bh(bh, sync);
brelse(bh);
for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
@@ -556,7 +556,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
exfat_init_name_entry(ep, uniname);
- exfat_update_bh(sb, bh, sync);
+ exfat_update_bh(bh, sync);
brelse(bh);
uniname += EXFAT_FILE_NAME_LEN;
}
@@ -580,7 +580,7 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
exfat_set_entry_type(ep, TYPE_DELETED);
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
}
@@ -604,16 +604,20 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
es->modified = true;
}
-void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
+int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
{
- int i;
+ int i, err = 0;
- for (i = 0; i < es->num_bh; i++) {
- if (es->modified)
- exfat_update_bh(es->sb, es->bh[i], sync);
- brelse(es->bh[i]);
- }
+ if (es->modified)
+ err = exfat_update_bhs(es->bh, es->num_bh, sync);
+
+ for (i = 0; i < es->num_bh; i++)
+ if (err)
+ bforget(es->bh[i]);
+ else
+ brelse(es->bh[i]);
kfree(es);
+ return err;
}
static int exfat_walk_fat_chain(struct super_block *sb,
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 75c7bdbeba6d..c013fe931d9c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -13,8 +13,6 @@
#define EXFAT_SUPER_MAGIC 0x2011BAB0UL
#define EXFAT_ROOT_INO 1
-#define EXFAT_SB_DIRTY 0
-
#define EXFAT_CLUSTERS_UNTRACKED (~0u)
/*
@@ -226,7 +224,8 @@ struct exfat_sb_info {
unsigned int num_FAT_sectors; /* num of FAT sectors */
unsigned int root_dir; /* root dir cluster */
unsigned int dentries_per_clu; /* num of dentries per cluster */
- unsigned int vol_flag; /* volume dirty flag */
+ unsigned int vol_flags; /* volume flags */
+ unsigned int vol_flags_persistent; /* volume flags to retain */
struct buffer_head *boot_bh; /* buffer_head of BOOT sector */
unsigned int map_clu; /* allocation bitmap start cluster */
@@ -238,7 +237,6 @@ struct exfat_sb_info {
unsigned int clu_srch_ptr; /* cluster search pointer */
unsigned int used_clusters; /* number of used clusters */
- unsigned long s_state;
struct mutex s_lock; /* superblock lock */
struct exfat_mount_options options;
struct nls_table *nls_io; /* Charset used for input and display */
@@ -250,6 +248,8 @@ struct exfat_sb_info {
struct rcu_head rcu;
};
+#define EXFAT_CACHE_VALID 0
+
/*
* EXFAT file system inode in-memory data
*/
@@ -383,7 +383,8 @@ static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
}
/* super.c */
-int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag);
+int exfat_set_volume_dirty(struct super_block *sb);
+int exfat_clear_volume_dirty(struct super_block *sb);
/* fatent.c */
#define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu)
@@ -429,7 +430,6 @@ extern const struct dentry_operations exfat_utf8_dentry_ops;
/* cache.c */
int exfat_cache_init(void);
void exfat_cache_shutdown(void);
-void exfat_cache_init_inode(struct inode *inode);
void exfat_cache_inval_inode(struct inode *inode);
int exfat_get_cluster(struct inode *inode, unsigned int cluster,
unsigned int *fclus, unsigned int *dclus,
@@ -463,7 +463,7 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned int type);
-void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
+int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
/* inode.c */
@@ -515,7 +515,8 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type);
-void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync);
+void exfat_update_bh(struct buffer_head *bh, int sync);
+int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync);
void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
unsigned int size, unsigned char flags);
void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec);
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 350ce59cc324..6aec6288e1f2 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -14,9 +14,8 @@
#define EXFAT_MAX_FILE_LEN 255
-#define VOL_CLEAN 0x0000
-#define VOL_DIRTY 0x0002
-#define ERR_MEDIUM 0x0004
+#define VOLUME_DIRTY 0x0002
+#define MEDIA_FAILURE 0x0004
#define EXFAT_EOF_CLUSTER 0xFFFFFFFFu
#define EXFAT_BAD_CLUSTER 0xFFFFFFF7u
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 4e5c5c9c0f2d..c3c9afee7418 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -75,7 +75,7 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc,
fat_entry = (__le32 *)&(bh->b_data[off]);
*fat_entry = cpu_to_le32(content);
- exfat_update_bh(sb, bh, sb->s_flags & SB_SYNCHRONOUS);
+ exfat_update_bh(bh, sb->s_flags & SB_SYNCHRONOUS);
exfat_mirror_bh(sb, sec, bh);
brelse(bh);
return 0;
@@ -174,7 +174,6 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
return -EIO;
}
- set_bit(EXFAT_SB_DIRTY, &sbi->s_state);
clu = p_chain->dir;
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
@@ -230,21 +229,6 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
return 0;
}
-static inline int exfat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
-{
- int i, err = 0;
-
- for (i = 0; i < nr_bhs; i++)
- write_dirty_buffer(bhs[i], 0);
-
- for (i = 0; i < nr_bhs; i++) {
- wait_on_buffer(bhs[i]);
- if (!err && !buffer_uptodate(bhs[i]))
- err = -EIO;
- }
- return err;
-}
-
int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
{
struct super_block *sb = dir->i_sb;
@@ -266,41 +250,23 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
}
/* Zeroing the unused blocks on this cluster */
- n = 0;
while (blknr < last_blknr) {
- bhs[n] = sb_getblk(sb, blknr);
- if (!bhs[n]) {
- err = -ENOMEM;
- goto release_bhs;
- }
- memset(bhs[n]->b_data, 0, sb->s_blocksize);
- exfat_update_bh(sb, bhs[n], 0);
-
- n++;
- blknr++;
-
- if (n == nr_bhs) {
- if (IS_DIRSYNC(dir)) {
- err = exfat_sync_bhs(bhs, n);
- if (err)
- goto release_bhs;
+ for (n = 0; n < nr_bhs && blknr < last_blknr; n++, blknr++) {
+ bhs[n] = sb_getblk(sb, blknr);
+ if (!bhs[n]) {
+ err = -ENOMEM;
+ goto release_bhs;
}
-
- for (i = 0; i < n; i++)
- brelse(bhs[i]);
- n = 0;
+ memset(bhs[n]->b_data, 0, sb->s_blocksize);
}
- }
- if (IS_DIRSYNC(dir)) {
- err = exfat_sync_bhs(bhs, n);
+ err = exfat_update_bhs(bhs, n, IS_DIRSYNC(dir));
if (err)
goto release_bhs;
- }
-
- for (i = 0; i < n; i++)
- brelse(bhs[i]);
+ for (i = 0; i < n; i++)
+ brelse(bhs[i]);
+ }
return 0;
release_bhs:
@@ -358,8 +324,6 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
}
}
- set_bit(EXFAT_SB_DIRTY, &sbi->s_state);
-
p_chain->dir = EXFAT_EOF_CLUSTER;
while ((new_clu = exfat_find_free_bitmap(sb, hint_clu)) !=
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a6a063830edc..f41f523a58ad 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -106,7 +106,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (ei->type != TYPE_FILE && ei->type != TYPE_DIR)
return -EPERM;
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi);
num_clusters_phys =
@@ -154,6 +154,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
struct timespec64 ts;
struct exfat_dentry *ep, *ep2;
struct exfat_entry_set_cache *es;
+ int err;
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
ES_ALL_ENTRIES);
@@ -188,7 +189,9 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
}
exfat_update_dir_chksum_with_entry_set(es);
- exfat_free_dentry_set(es, inode_needs_sync(inode));
+ err = exfat_free_dentry_set(es, inode_needs_sync(inode));
+ if (err)
+ return err;
}
/* cut off from the FAT chain */
@@ -217,7 +220,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (exfat_free_cluster(inode, &clu))
return -EIO;
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
return 0;
}
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index cf9ca6c4d046..a6de17cac3df 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -39,7 +39,7 @@ static int __exfat_write_inode(struct inode *inode, int sync)
if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1)
return 0;
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
/* get the directory entry of given file or directory */
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES);
@@ -77,8 +77,7 @@ static int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
exfat_update_dir_chksum_with_entry_set(es);
- exfat_free_dentry_set(es, sync);
- return 0;
+ return exfat_free_dentry_set(es, sync);
}
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -168,7 +167,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
}
if (*clu == EXFAT_EOF_CLUSTER) {
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
EXFAT_EOF_CLUSTER : last_clu + 1;
@@ -222,6 +221,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
if (ei->dir.dir != DIR_DELETED && modified) {
struct exfat_dentry *ep;
struct exfat_entry_set_cache *es;
+ int err;
es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
ES_ALL_ENTRIES);
@@ -240,8 +240,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
ep->dentry.stream.valid_size;
exfat_update_dir_chksum_with_entry_set(es);
- exfat_free_dentry_set(es, inode_needs_sync(inode));
-
+ err = exfat_free_dentry_set(es, inode_needs_sync(inode));
+ if (err)
+ return err;
} /* end of if != DIR_DELETED */
inode->i_blocks +=
@@ -610,8 +611,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->i_crtime = info->crtime;
inode->i_atime = info->atime;
- exfat_cache_init_inode(inode);
-
return 0;
}
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 17d41f3d3709..d34e6193258d 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -163,9 +163,8 @@ u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type)
return chksum;
}
-void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync)
+void exfat_update_bh(struct buffer_head *bh, int sync)
{
- set_bit(EXFAT_SB_DIRTY, &EXFAT_SB(sb)->s_state);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -173,6 +172,25 @@ void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync)
sync_dirty_buffer(bh);
}
+int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync)
+{
+ int i, err = 0;
+
+ for (i = 0; i < nr_bhs; i++) {
+ set_buffer_uptodate(bhs[i]);
+ mark_buffer_dirty(bhs[i]);
+ if (sync)
+ write_dirty_buffer(bhs[i], 0);
+ }
+
+ for (i = 0; i < nr_bhs && sync; i++) {
+ wait_on_buffer(bhs[i]);
+ if (!err && !buffer_uptodate(bhs[i]))
+ err = -EIO;
+ }
+ return err;
+}
+
void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
unsigned int size, unsigned char flags)
{
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 2b9e21094a96..c94ac239f740 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -387,7 +387,7 @@ static int exfat_find_empty_entry(struct inode *inode,
ep->dentry.stream.valid_size = cpu_to_le64(size);
ep->dentry.stream.size = ep->dentry.stream.valid_size;
ep->dentry.stream.flags = p_dir->flags;
- exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
if (exfat_update_dir_chksum(inode, &(ei->dir),
ei->entry))
@@ -562,10 +562,10 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
int err;
mutex_lock(&EXFAT_SB(sb)->s_lock);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE,
&info);
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -578,7 +578,8 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode))
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
inode_inc_iversion(inode);
@@ -745,10 +746,9 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
- }
i_mode = inode->i_mode;
alias = d_find_alias(inode);
@@ -834,7 +834,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
num_entries++;
brelse(bh);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
/* update the directory entry */
if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
err = -EIO;
@@ -843,7 +843,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
/* This doesn't modify ei */
ei->dir.dir = DIR_DELETED;
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -873,10 +873,10 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int err;
mutex_lock(&EXFAT_SB(sb)->s_lock);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
&info);
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -890,10 +890,9 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
- }
inode_inc_iversion(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
@@ -1001,14 +1000,14 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
num_entries++;
brelse(bh);
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
if (err) {
exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
goto unlock;
}
ei->dir.dir = DIR_DELETED;
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -1071,7 +1070,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
}
- exfat_update_bh(sb, new_bh, sync);
+ exfat_update_bh(new_bh, sync);
brelse(old_bh);
brelse(new_bh);
@@ -1087,7 +1086,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
}
memcpy(epnew, epold, DENTRY_SIZE);
- exfat_update_bh(sb, new_bh, sync);
+ exfat_update_bh(new_bh, sync);
brelse(old_bh);
brelse(new_bh);
@@ -1104,7 +1103,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
}
- exfat_update_bh(sb, old_bh, sync);
+ exfat_update_bh(old_bh, sync);
brelse(old_bh);
ret = exfat_init_ext_entry(inode, p_dir, oldentry,
num_new_entries, p_uniname);
@@ -1159,7 +1158,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
}
- exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
brelse(new_bh);
@@ -1175,7 +1174,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
}
memcpy(epnew, epmov, DENTRY_SIZE);
- exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
brelse(new_bh);
@@ -1300,7 +1299,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
if (ret)
goto out;
- exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_set_volume_dirty(sb);
if (olddir.dir == newdir.dir)
ret = exfat_rename_file(new_parent_inode, &olddir, dentry,
@@ -1355,7 +1354,7 @@ del_out:
*/
new_ei->dir.dir = DIR_DELETED;
}
- exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_clear_volume_dirty(sb);
out:
return ret;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 253a92460d52..60b941ba557b 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -45,9 +45,6 @@ static void exfat_put_super(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
mutex_lock(&sbi->s_lock);
- if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state))
- sync_blockdev(sb->s_bdev);
- exfat_set_vol_flags(sb, VOL_CLEAN);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
@@ -60,13 +57,14 @@ static int exfat_sync_fs(struct super_block *sb, int wait)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int err = 0;
+ if (!wait)
+ return 0;
+
/* If there are some dirty buffers in the bdev inode */
mutex_lock(&sbi->s_lock);
- if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) {
- sync_blockdev(sb->s_bdev);
- if (exfat_set_vol_flags(sb, VOL_CLEAN))
- err = -EIO;
- }
+ sync_blockdev(sb->s_bdev);
+ if (exfat_clear_volume_dirty(sb))
+ err = -EIO;
mutex_unlock(&sbi->s_lock);
return err;
}
@@ -98,17 +96,20 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
+static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data;
bool sync;
+ /* retain persistent-flags */
+ new_flags |= sbi->vol_flags_persistent;
+
/* flags are not changed */
- if (sbi->vol_flag == new_flag)
+ if (sbi->vol_flags == new_flags)
return 0;
- sbi->vol_flag = new_flag;
+ sbi->vol_flags = new_flags;
/* skip updating volume dirty flag,
* if this volume has been mounted with read-only
@@ -116,9 +117,9 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
if (sb_rdonly(sb))
return 0;
- p_boot->vol_flags = cpu_to_le16(new_flag);
+ p_boot->vol_flags = cpu_to_le16(new_flags);
- if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->boot_bh))
+ if ((new_flags & VOLUME_DIRTY) && !buffer_dirty(sbi->boot_bh))
sync = true;
else
sync = false;
@@ -131,6 +132,20 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
return 0;
}
+int exfat_set_volume_dirty(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ return exfat_set_vol_flags(sb, sbi->vol_flags | VOLUME_DIRTY);
+}
+
+int exfat_clear_volume_dirty(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ return exfat_set_vol_flags(sb, sbi->vol_flags & ~VOLUME_DIRTY);
+}
+
static int exfat_show_options(struct seq_file *m, struct dentry *root)
{
struct super_block *sb = root->d_sb;
@@ -361,7 +376,6 @@ static int exfat_read_root(struct inode *inode)
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
current_time(inode);
exfat_truncate_atime(&inode->i_atime);
- exfat_cache_init_inode(inode);
return 0;
}
@@ -459,7 +473,8 @@ static int exfat_read_boot_sector(struct super_block *sb)
sbi->dentries_per_clu = 1 <<
(sbi->cluster_size_bits - DENTRY_SIZE_BITS);
- sbi->vol_flag = le16_to_cpu(p_boot->vol_flags);
+ sbi->vol_flags = le16_to_cpu(p_boot->vol_flags);
+ sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
@@ -474,9 +489,9 @@ static int exfat_read_boot_sector(struct super_block *sb)
exfat_err(sb, "bogus data start sector");
return -EINVAL;
}
- if (sbi->vol_flag & VOL_DIRTY)
+ if (sbi->vol_flags & VOLUME_DIRTY)
exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck.");
- if (sbi->vol_flag & ERR_MEDIUM)
+ if (sbi->vol_flags & MEDIA_FAILURE)
exfat_warn(sb, "Medium has reported failures. Some data may be lost.");
/* exFAT file size is limited by a disk volume size */
@@ -747,6 +762,10 @@ static void exfat_inode_init_once(void *foo)
{
struct exfat_inode_info *ei = (struct exfat_inode_info *)foo;
+ spin_lock_init(&ei->cache_lru_lock);
+ ei->nr_caches = 0;
+ ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
+ INIT_LIST_HEAD(&ei->cache_lru);
INIT_HLIST_NODE(&ei->i_hash_fat);
inode_init_once(&ei->vfs_inode);
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 60378ddf1424..96044f5dbc0e 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -93,8 +93,10 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ext2_inode_info *ei = EXT2_I(inode);
vm_fault_t ret;
+ bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
- if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (write) {
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
}
@@ -103,7 +105,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
up_read(&ei->dax_sem);
- if (vmf->flags & FAULT_FLAG_WRITE)
+ if (write)
sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 80662e1f7889..415c21f0e750 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1241,7 +1241,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 1);
}
- /* fall through */
+ fallthrough;
case EXT2_IND_BLOCK:
nr = i_data[EXT2_DIND_BLOCK];
if (nr) {
@@ -1249,7 +1249,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 2);
}
- /* fall through */
+ fallthrough;
case EXT2_DIND_BLOCK:
nr = i_data[EXT2_TIND_BLOCK];
if (nr) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index dda860562ca3..7fab2b3b5b39 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -587,7 +587,7 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_xip:
ext2_msg(sb, KERN_INFO, "use dax instead of xip");
set_opt(opts->s_mount_opt, XIP);
- /* Fall through */
+ fallthrough;
case Opt_dax:
#ifdef CONFIG_FS_DAX
ext2_msg(sb, KERN_WARNING,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 1afa5a4bcb5f..619dd35ddd48 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -110,7 +110,7 @@ config EXT4_KUNIT_TESTS
This builds the ext4 KUnit tests.
KUnit tests run during boot and output the results to the debug log
- in TAP format (http://testanything.org/). Only useful for kernel devs
+ in TAP format (https://testanything.org/). Only useful for kernel devs
running KUnit test harness and are not for inclusion into a production
build.
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ba46d87cdf1..48c3df47748d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -413,7 +413,8 @@ verified:
* Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -441,6 +442,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
if (bitmap_uptodate(bh))
goto verify;
@@ -487,10 +494,11 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0), bh);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -534,7 +542,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 16e9b2fda03a..c54ba52f2dd4 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -24,6 +24,7 @@ struct ext4_system_zone {
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
+ u32 ino;
};
static struct kmem_cache *ext4_system_zone_cachep;
@@ -45,7 +46,8 @@ void ext4_exit_system_zone(void)
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
- if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+ if ((entry1->start_blk + entry1->count) == entry2->start_blk &&
+ entry1->ino == entry2->ino)
return 1;
return 0;
}
@@ -66,9 +68,9 @@ static void release_system_zone(struct ext4_system_blocks *system_blks)
*/
static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
- unsigned int count)
+ unsigned int count, u32 ino)
{
- struct ext4_system_zone *new_entry = NULL, *entry;
+ struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
@@ -79,30 +81,21 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
n = &(*n)->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
- else {
- if (start_blk + count > (entry->start_blk +
- entry->count))
- entry->count = (start_blk + count -
- entry->start_blk);
- new_node = *n;
- new_entry = rb_entry(new_node, struct ext4_system_zone,
- node);
- break;
- }
+ else /* Unexpected overlap of system zones. */
+ return -EFSCORRUPTED;
}
- if (!new_entry) {
- new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
- GFP_KERNEL);
- if (!new_entry)
- return -ENOMEM;
- new_entry->start_blk = start_blk;
- new_entry->count = count;
- new_node = &new_entry->node;
-
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &system_blks->root);
- }
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_entry->ino = ino;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &system_blks->root);
/* Can we merge to the left? */
node = rb_prev(new_node);
@@ -151,40 +144,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_CONT "\n");
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
-static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
- struct ext4_system_blocks *system_blks,
- ext4_fsblk_t start_blk,
- unsigned int count)
-{
- struct ext4_system_zone *entry;
- struct rb_node *n;
-
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es)))
- return 0;
-
- if (system_blks == NULL)
- return 1;
-
- n = system_blks->root.rb_node;
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else
- return 0;
- }
- return 1;
-}
-
static int ext4_protect_reserved_inode(struct super_block *sb,
struct ext4_system_blocks *system_blks,
u32 ino)
@@ -214,19 +173,18 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid_rcu(sbi, system_blks,
- map.m_pblk, n)) {
- err = -EFSCORRUPTED;
- __ext4_error(sb, __func__, __LINE__, -err,
- map.m_pblk, "blocks %llu-%llu "
- "from inode %u overlap system zone",
- map.m_pblk,
- map.m_pblk + map.m_len - 1, ino);
+ err = add_system_zone(system_blks, map.m_pblk, n, ino);
+ if (err < 0) {
+ if (err == -EFSCORRUPTED) {
+ __ext4_error(sb, __func__, __LINE__,
+ -err, map.m_pblk,
+ "blocks %llu-%llu from inode %u overlap system zone",
+ map.m_pblk,
+ map.m_pblk + map.m_len - 1,
+ ino);
+ }
break;
}
- err = add_system_zone(system_blks, map.m_pblk, n);
- if (err < 0)
- break;
i += n;
}
}
@@ -262,14 +220,6 @@ int ext4_setup_system_zone(struct super_block *sb)
int flex_size = ext4_flex_bg_size(sbi);
int ret;
- if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks)
- ext4_release_system_zone(sb);
- return 0;
- }
- if (sbi->system_blks)
- return 0;
-
system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
if (!system_blks)
return -ENOMEM;
@@ -277,22 +227,25 @@ int ext4_setup_system_zone(struct super_block *sb)
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(system_blks,
+ ((i < 5) || ((i % flex_size) == 0))) {
+ ret = add_system_zone(system_blks,
ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1);
+ ext4_bg_num_gdb(sb, i) + 1, 0);
+ if (ret)
+ goto err;
+ }
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(system_blks,
- ext4_block_bitmap(sb, gdp), 1);
+ ext4_block_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
- ext4_inode_bitmap(sb, gdp), 1);
+ ext4_inode_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group);
+ sbi->s_itb_per_group, 0);
if (ret)
goto err;
}
@@ -341,11 +294,24 @@ void ext4_release_system_zone(struct super_block *sb)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
unsigned int count)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_system_blocks *system_blks;
- int ret;
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+ int ret = 1;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es)))
+ return 0;
/*
* Lock the system zone to prevent it being released concurrently
@@ -354,8 +320,22 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
*/
rcu_read_lock();
system_blks = rcu_dereference(sbi->system_blks);
- ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
- count);
+ if (system_blks == NULL)
+ goto out_rcu;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ ret = (entry->ino == inode->i_ino);
+ break;
+ }
+ }
+out_rcu:
rcu_read_unlock();
return ret;
}
@@ -374,8 +354,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
+ unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EFSCORRUPTED;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d82336b1cd4..efe77cffc322 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -148,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42f5060f3cdf..f9a692c0a66c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -434,10 +434,36 @@ struct flex_groups {
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */
-
-/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
+/* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \
+ EXT4_UNRM_FL | \
+ EXT4_COMPR_FL | \
+ EXT4_SYNC_FL | \
+ EXT4_IMMUTABLE_FL | \
+ EXT4_APPEND_FL | \
+ EXT4_NODUMP_FL | \
+ EXT4_NOATIME_FL | \
+ EXT4_JOURNAL_DATA_FL | \
+ EXT4_NOTAIL_FL | \
+ EXT4_DIRSYNC_FL | \
+ EXT4_TOPDIR_FL | \
+ EXT4_EXTENTS_FL | \
+ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
+ EXT4_DAX_FL | \
+ EXT4_PROJINHERIT_FL | \
+ EXT4_CASEFOLD_FL)
+
+/* User visible flags */
+#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \
+ EXT4_DIRTY_FL | \
+ EXT4_COMPRBLK_FL | \
+ EXT4_NOCOMPR_FL | \
+ EXT4_ENCRYPT_FL | \
+ EXT4_INDEX_FL | \
+ EXT4_VERITY_FL | \
+ EXT4_INLINE_DATA_FL)
+
+/* Flags we can manipulate with through FS_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
@@ -669,8 +695,6 @@ enum {
/*
* ioctl commands
*/
-#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
@@ -687,17 +711,11 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
-#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
-#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
-#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
/* ioctl codes 19--39 are reserved for fscrypt */
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
-#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
/*
@@ -722,8 +740,6 @@ enum {
/*
* ioctl commands in 32 bit emulation
*/
-#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
@@ -1054,6 +1070,7 @@ struct ext4_inode_info {
struct timespec64 i_crtime;
/* mballoc */
+ atomic_t i_prealloc_active;
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
@@ -1172,6 +1189,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
+#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1383,7 +1401,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL)
+#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
#else
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
@@ -1501,10 +1519,13 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_mb_max_inode_prealloc;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -1572,9 +1593,11 @@ struct ext4_sb_info {
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+ atomic_t s_warning_count;
+ atomic_t s_msg_count;
- /* Encryption context for '-o test_dummy_encryption' */
- struct fscrypt_dummy_context s_dummy_enc_ctx;
+ /* Encryption policy for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_policy s_dummy_enc_policy;
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
@@ -1585,6 +1608,9 @@ struct ext4_sb_info {
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
#endif
+ /* Record the errseq of the backing block device */
+ errseq_t s_bdev_wb_err;
+ spinlock_t s_bdev_wb_lock;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2313,9 +2339,15 @@ struct ext4_lazy_init {
struct mutex li_list_mtx;
};
+enum ext4_li_mode {
+ EXT4_LI_MODE_PREFETCH_BBITMAP,
+ EXT4_LI_MODE_ITABLE,
+};
+
struct ext4_li_request {
struct super_block *lr_super;
- struct ext4_sb_info *lr_sbi;
+ enum ext4_li_mode lr_mode;
+ ext4_group_t lr_first_not_zeroed;
ext4_group_t lr_next_group;
struct list_head lr_request;
unsigned long lr_next_sched;
@@ -2446,7 +2478,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -2651,9 +2684,15 @@ extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
+extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int nr, int *cnt);
+extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr);
+
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
@@ -2765,8 +2804,7 @@ extern int ext4_search_dir(struct buffer_head *bh,
struct ext4_filename *fname,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir);
-extern int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+extern int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2924,12 +2962,6 @@ do { \
#endif
-extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
- __u32 compat);
-extern int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat);
-extern int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat);
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -3145,6 +3177,7 @@ struct ext4_group_info {
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3159,6 +3192,8 @@ struct ext4_group_info {
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -3363,9 +3398,9 @@ extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
-extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
- ext4_fsblk_t start_blk,
- unsigned int count);
+extern int ext4_inode_block_valid(struct inode *inode,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0c76cdd44d90..760b9ee49dc0 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -195,6 +195,28 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
jbd2_journal_abort_handle(handle);
}
+static void ext4_check_bdev_write_error(struct super_block *sb)
+{
+ struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * If the block device has write error flag, it may have failed to
+ * async write out metadata buffers in the background. In this case,
+ * we could read old data from disk and write it out again, which
+ * may lead to on-disk filesystem inconsistency.
+ */
+ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) {
+ spin_lock(&sbi->s_bdev_wb_lock);
+ err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err);
+ spin_unlock(&sbi->s_bdev_wb_lock);
+ if (err)
+ ext4_error_err(sb, -err,
+ "Error while async write back metadata");
+ }
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
@@ -202,6 +224,9 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
+ if (bh->b_bdev->bd_super)
+ ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 221f240eae60..a0481582187a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -340,7 +340,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
*/
if (lblock + len <= lblock)
return 0;
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+ return ext4_inode_block_valid(inode, block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
@@ -348,7 +348,7 @@ static int ext4_valid_extent_idx(struct inode *inode,
{
ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+ return ext4_inode_block_valid(inode, block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
@@ -507,14 +507,10 @@ __read_extent_tree_block(const char *function, unsigned int line,
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- if (!ext4_has_feature_journal(inode->i_sb) ||
- (inode->i_ino !=
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
- if (err)
- goto errout;
- }
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
@@ -693,10 +689,8 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path)
return;
depth = path->p_depth;
for (i = 0; i <= depth; i++, path++) {
- if (path->p_bh) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
+ brelse(path->p_bh);
+ path->p_bh = NULL;
}
}
@@ -1915,7 +1909,7 @@ out:
/*
* ext4_ext_insert_extent:
- * tries to merge requsted extent into the existing extent or
+ * tries to merge requested extent into the existing extent or
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
@@ -3125,7 +3119,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
- * of which are deterimined by split_flag.
+ * of which are determined by split_flag.
*
* There are two cases:
* a> the extent are splitted into two extent.
@@ -3650,7 +3644,7 @@ static int ext4_split_convert_extents(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
/*
* It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
+ * zeroout only if extent is fully inside i_size or new_size.
*/
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -4272,7 +4266,7 @@ got_allocated_blocks:
* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free().
*/
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ext4_free_blocks(handle, inode, NULL, newblock,
@@ -4495,7 +4489,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Round up offset. This is not fallocate, we neet to zero out
+ * Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
* range.
@@ -5299,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
}
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, punch_start,
EXT_MAX_BLOCKS - punch_start);
@@ -5313,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
@@ -5445,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
@@ -5579,7 +5573,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
- /* Do we have somthing to swap ? */
+ /* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
goto finish;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 129cc1dd6b79..7d61069531d3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -145,10 +145,9 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
- {
+ !EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -428,6 +427,10 @@ restart:
*/
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
!ext4_overwrite_io(inode, offset, count))) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
inode_unlock_shared(inode);
*ilock_shared = false;
inode_lock(inode);
@@ -812,7 +815,7 @@ out:
return err;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3e133793a5a3..2924261226e0 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -233,7 +233,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -247,7 +247,7 @@ static int __ext4fs_dirhash(const char *name, int len,
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df25d38d6539..698ca4a4db5f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -742,6 +742,53 @@ not_found:
return 1;
}
+static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
+ bool encrypt)
+{
+ struct super_block *sb = dir->i_sb;
+ int nblocks = 0;
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never more than 1k.
+ * In practice they are under 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */,
+ NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ return nblocks;
+}
+
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
@@ -772,7 +819,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
- int encrypt = 0;
+ bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -784,59 +831,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
- !(i_flags & EXT4_EA_INODE_FL)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return ERR_PTR(err);
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-ENOKEY);
- encrypt = 1;
- }
-
- if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p))
- return ERR_CAST(p);
- if (p) {
- int acl_size = p->a_count * sizeof(ext4_acl_entry);
-
- nblocks += (S_ISDIR(mode) ? 2 : 1) *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, acl_size,
- true /* is_create */);
- posix_acl_release(p);
- }
-#endif
-
-#ifdef CONFIG_SECURITY
- {
- int num_security_xattrs = 1;
-
-#ifdef CONFIG_INTEGRITY
- num_security_xattrs++;
-#endif
- /*
- * We assume that security xattrs are never
- * more than 1k. In practice they are under
- * 128 bytes.
- */
- nblocks += num_security_xattrs *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, 1024,
- true /* is_create */);
- }
-#endif
- if (encrypt)
- nblocks += __ext4_xattr_set_credits(sb,
- NULL /* inode */, NULL /* block_bh */,
- FSCRYPT_SET_CONTEXT_MAX_SIZE,
- true /* is_create */);
- }
-
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
@@ -866,10 +860,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+ if (!(i_flags & EXT4_EA_INODE_FL)) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto out;
+ }
+
err = dquot_initialize(inode);
if (err)
goto out;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+ ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
+ if (ret2 < 0) {
+ err = ret2;
+ goto out;
+ }
+ nblocks += ret2;
+ }
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -1162,7 +1171,7 @@ got:
* prevent its deduplication.
*/
if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
+ err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index be2b66eb65f7..80c9f33800be 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -858,8 +858,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
+ if (!ext4_inode_block_valid(inode, block_to_free, count)) {
EXT4_ERROR_INODE(inode, "attempt to clear invalid "
"blocks %llu len %lu",
(unsigned long long) block_to_free, count);
@@ -1004,8 +1003,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!nr)
continue; /* A hole */
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
+ if (!ext4_inode_block_valid(inode, nr, 1)) {
EXT4_ERROR_INODE(inode,
"invalid indirect mapped "
"block %lu (level %d)",
@@ -1182,21 +1180,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
@@ -1436,7 +1434,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
if (++n >= n2)
break;
@@ -1445,7 +1443,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
if (++n >= n2)
break;
@@ -1454,7 +1452,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c3a1ad2db122..75c97bca0815 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -276,7 +276,7 @@ static int ext4_create_inline_data(handle_t *handle,
len = 0;
}
- /* Insert the the xttr entry. */
+ /* Insert the xttr entry. */
i.value = value;
i.value_len = len;
@@ -1706,7 +1706,7 @@ int ext4_delete_inline_entry(handle_t *handle,
if (err)
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ err = ext4_generic_delete_entry(dir, de_del, bh,
inline_start, inline_size, 0);
if (err)
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 44bad4bb8831..bf596467c234 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
}
static int __check_block_validity(struct inode *inode, const char *func,
@@ -394,8 +394,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
(inode->i_ino ==
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
return 0;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
- map->m_len)) {
+ if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
"(length %d)", (unsigned long) map->m_lblk,
@@ -3288,7 +3287,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
if (PageChecked(page))
return 0;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, page);
else
return try_to_free_buffers(page);
}
@@ -4056,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (stop_block > first_block) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
@@ -4163,7 +4162,7 @@ int ext4_truncate(struct inode *inode)
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return 0;
+ goto out_trace;
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4172,16 +4171,14 @@ int ext4_truncate(struct inode *inode)
int has_inline = 1;
err = ext4_inline_data_truncate(inode, &has_inline);
- if (err)
- return err;
- if (has_inline)
- return 0;
+ if (err || has_inline)
+ goto out_trace;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
- return 0;
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4190,8 +4187,10 @@ int ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_trace;
+ }
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4211,7 +4210,7 @@ int ext4_truncate(struct inode *inode)
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
@@ -4242,6 +4241,7 @@ out_stop:
err = err2;
ext4_journal_stop(handle);
+out_trace:
trace_ext4_truncate_exit(inode);
return err;
}
@@ -4760,7 +4760,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = 0;
if (ei->i_file_acl &&
- !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad extended attribute block %llu",
ei->i_file_acl);
@@ -4901,7 +4901,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
(inode->i_state & I_DIRTY_TIME)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 999cf6add39c..36eca3bc036a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
reset_inode_seed(inode);
reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
@@ -819,12 +819,12 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case FS_IOC_GETFSMAP:
return ext4_ioc_getfsmap(sb, (void __user *)arg);
- case EXT4_IOC_GETFLAGS:
+ case FS_IOC_GETFLAGS:
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (S_ISREG(inode->i_mode))
flags &= ~EXT4_PROJINHERIT_FL;
return put_user(flags, (int __user *) arg);
- case EXT4_IOC_SETFLAGS: {
+ case FS_IOC_SETFLAGS: {
int err;
if (!inode_owner_or_capable(inode))
@@ -1129,12 +1129,12 @@ resizefs_out:
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+ case FS_IOC_GET_ENCRYPTION_PWSALT: {
#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1174,7 +1174,7 @@ resizefs_out:
return -EOPNOTSUPP;
#endif
}
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
@@ -1236,7 +1236,7 @@ resizefs_out:
case EXT4_IOC_GET_ES_CACHE:
return ext4_ioctl_get_es_cache(filp, arg);
- case EXT4_IOC_FSGETXATTR:
+ case FS_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -1247,7 +1247,7 @@ resizefs_out:
return -EFAULT;
return 0;
}
- case EXT4_IOC_FSSETXATTR:
+ case FS_IOC_FSSETXATTR:
{
struct fsxattr fa, old_fa;
int err;
@@ -1313,11 +1313,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case EXT4_IOC32_GETFLAGS:
- cmd = EXT4_IOC_GETFLAGS;
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
break;
- case EXT4_IOC32_SETFLAGS:
- cmd = EXT4_IOC_SETFLAGS;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
@@ -1361,9 +1361,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_RESIZE_FS:
case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
- case EXT4_IOC_GET_ENCRYPTION_PWSALT:
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
case FS_IOC_ADD_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY:
@@ -1377,8 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
- case EXT4_IOC_FSGETXATTR:
- case EXT4_IOC_FSSETXATTR:
+ case FS_IOC_FSGETXATTR:
+ case FS_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c0a331e2feb0..132c118d12e1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -1279,9 +1279,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
-
return 0;
err:
@@ -1743,10 +1740,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
}
-/*
- * regular allocator, for general purposes allocation
- */
-
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
int finish_group)
@@ -2119,13 +2112,11 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
- free = grp->bb_free;
- if (free == 0)
- return false;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return false;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ free = grp->bb_free;
+ if (free == 0)
return false;
fragments = grp->bb_fragments;
@@ -2142,8 +2133,10 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
((group % flex_size) == 0))
return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
+
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
return true;
if (grp->bb_largest_free_order < ac->ac_2order)
@@ -2177,6 +2170,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
ext4_grpblk_t free;
int ret = 0;
@@ -2195,7 +2189,25 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if possible.
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -2209,15 +2221,95 @@ out:
return ret;
}
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+ * be expensive.
+ */
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ grp = ext4_get_group_info(sb, group);
+
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
int cr = -1;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2237,8 +2329,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
@@ -2282,6 +2374,7 @@ repeat:
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2293,6 +2386,29 @@ repeat:
if (group >= ngroups)
group = 0;
+ /*
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
+ */
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = (group / sbi->s_mb_prefetch) *
+ sbi->s_mb_prefetch;
+ nr = nr + sbi->s_mb_prefetch - group;
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
+
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
@@ -2341,22 +2457,24 @@ repeat:
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
goto repeat;
}
}
@@ -2367,6 +2485,10 @@ out:
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
ac->ac_flags, cr, err);
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
@@ -2439,7 +2561,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
+ seq_puts(seq, " ]\n");
return 0;
}
@@ -2613,6 +2735,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO */
+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* now many real IOs to prefetch within a single allocation at cr=0
+ * given cr=0 is an CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
@@ -2736,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3090,7 +3233,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (!ext4_data_block_valid(sbi, block, len)) {
+ if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
@@ -3674,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
+}
+
static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
@@ -3706,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3830,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
+ atomic_inc(&ei->i_prealloc_active);
}
/*
@@ -4040,7 +4204,7 @@ repeat:
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -4103,7 +4267,7 @@ out_dbg:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -4121,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
- trace_ext4_discard_preallocations(inode);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active), needed);
INIT_LIST_HEAD(&list);
+ if (needed == 0)
+ needed = UINT_MAX;
+
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
@@ -4146,10 +4314,11 @@ repeat:
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
+ needed--;
continue;
}
@@ -4399,7 +4568,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
@@ -4450,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
@@ -4549,10 +4718,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int count, delta;
+
+ count = atomic_read(&ei->i_prealloc_active);
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ count -= sbi->s_mb_max_inode_prealloc;
+ ext4_discard_preallocations(inode, count);
+ }
+}
+
+/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct inode *inode = ac->ac_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -4564,21 +4752,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+ if (pa->pa_type == MB_INODE_PA) {
+ /*
+ * treat per-inode prealloc list as a lru list, then try
+ * to trim the least recently used PA.
+ */
spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
}
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -4588,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
+ ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -4915,7 +5114,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_data_block_valid(sbi, block, count)) {
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
"block = %llu, count = %lu", block, count);
goto error_return;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c2935d..e75b4749aa1c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -73,6 +73,10 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOC 512
struct ext4_free_data {
/* this links the free block information from sb_info */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1ed86fb6c302..0d601b822875 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode);
- ext4_discard_preallocations(donor_inode);
+ ext4_discard_preallocations(orig_inode, 0);
+ ext4_discard_preallocations(donor_inode, 0);
}
ext4_ext_drop_refs(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 56738b538ddf..0d74615fcce3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -663,8 +663,7 @@ static struct stats dx_show_leaf(struct inode *dir,
/* Directory is encrypted */
res = fscrypt_fname_alloc_buffer(
- dir, len,
- &fname_crypto_str);
+ len, &fname_crypto_str);
if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
@@ -1016,8 +1015,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
brelse(bh);
return err;
}
- err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
+ &fname_crypto_str);
if (err < 0) {
brelse(bh);
return err;
@@ -1396,8 +1395,8 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
+ buf_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -1858,7 +1857,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
blocksize, hinfo, map);
map -= count;
dx_sort_map(map, count);
- /* Split the existing block in the middle, size-wise */
+ /* Ensure that neither split block is over half full */
size = 0;
move = 0;
for (i = count-1; i >= 0; i--) {
@@ -1868,8 +1867,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
size += map[i].size;
move++;
}
- /* map index at which we will split */
- split = count - move;
+ /*
+ * map index at which we will split
+ *
+ * If the sum of active entries didn't exceed half the block size, just
+ * split it in half by count; each resulting block will have at least
+ * half the space free.
+ */
+ if (i > 0)
+ split = count - move;
+ else
+ split = count/2;
+
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
@@ -2455,8 +2464,7 @@ cleanup:
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
*/
-int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2472,7 +2480,7 @@ int ext4_generic_delete_entry(handle_t *handle,
de = (struct ext4_dir_entry_2 *)entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data, bh->b_size, i))
+ entry_buf, buf_size, i))
return -EFSCORRUPTED;
if (de == de_del) {
if (pde)
@@ -2517,8 +2525,7 @@ static int ext4_delete_entry(handle_t *handle,
if (unlikely(err))
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del,
- bh, bh->b_data,
+ err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
dir->i_sb->s_blocksize, csum_size);
if (err)
goto out;
@@ -3193,30 +3200,33 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
* in separate transaction */
retval = dquot_initialize(dir);
if (retval)
- return retval;
+ goto out_trace;
retval = dquot_initialize(d_inode(dentry));
if (retval)
- return retval;
+ goto out_trace;
- retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
- if (!bh)
- goto end_unlink;
+ if (IS_ERR(bh)) {
+ retval = PTR_ERR(bh);
+ goto out_trace;
+ }
+ if (!bh) {
+ retval = -ENOENT;
+ goto out_trace;
+ }
inode = d_inode(dentry);
- retval = -EFSCORRUPTED;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_unlink;
+ if (le32_to_cpu(de->inode) != inode->i_ino) {
+ retval = -EFSCORRUPTED;
+ goto out_bh;
+ }
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- handle = NULL;
- goto end_unlink;
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
@@ -3224,12 +3234,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto end_unlink;
+ goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto end_unlink;
+ goto out_handle;
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
dentry->d_name.len, dentry->d_name.name);
@@ -3251,10 +3261,11 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
d_invalidate(dentry);
#endif
-end_unlink:
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
brelse(bh);
- if (handle)
- ext4_journal_stop(handle);
+out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f2df2db0786c..f014c5e473a9 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -140,7 +140,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
case STEP_VERITY:
if (ctx->enabled_steps & (1 << STEP_VERITY)) {
INIT_WORK(&ctx->work, verity_work);
@@ -148,7 +148,7 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
default:
__read_end_io(ctx->bio);
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0907f907c47d..8b2736283481 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -66,10 +66,10 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static int ext4_commit_super(struct super_block *sb, int sync);
-static void ext4_mark_recovery_complete(struct super_block *sb,
+static int ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
-static void ext4_clear_journal_err(struct super_block *sb,
- struct ext4_super_block *es);
+static int ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -744,6 +744,7 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
+ atomic_inc(&EXT4_SB(sb)->s_msg_count);
if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
return;
@@ -754,9 +755,12 @@ void __ext4_msg(struct super_block *sb,
va_end(args);
}
-#define ext4_warning_ratelimit(sb) \
- ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
- "EXT4-fs warning")
+static int ext4_warning_ratelimit(struct super_block *sb)
+{
+ atomic_inc(&EXT4_SB(sb)->s_warning_count);
+ return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning");
+}
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
@@ -1100,7 +1104,7 @@ static void ext4_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
@@ -1123,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
+ atomic_set(&ei->i_prealloc_active, 0);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
@@ -1216,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -1288,8 +1293,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
if (!page_has_buffers(page))
return 0;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_DIRECT_RECLAIM);
+ return jbd2_journal_try_to_free_buffers(journal, page);
+
return try_to_free_buffers(page);
}
@@ -1387,10 +1392,9 @@ retry:
return res;
}
-static const union fscrypt_context *
-ext4_get_dummy_context(struct super_block *sb)
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
- return EXT4_SB(sb)->s_dummy_enc_ctx.ctx;
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}
static bool ext4_has_stable_inodes(struct super_block *sb)
@@ -1409,7 +1413,7 @@ static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
- .get_dummy_context = ext4_get_dummy_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
@@ -1522,6 +1526,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ Opt_prefetch_block_bitmaps,
};
static const match_table_t tokens = {
@@ -1614,6 +1619,7 @@ static const match_table_t tokens = {
{Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
+ {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1831,6 +1837,8 @@ static const struct mount_opts {
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
+ MOPT_SET},
{Opt_err, 0, 0}
};
@@ -1879,12 +1887,13 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !sbi->s_dummy_enc_ctx.ctx) {
+ if (is_remount && !sbi->s_dummy_enc_policy.policy) {
ext4_msg(sb, KERN_WARNING,
"Can't set test_dummy_encryption on remount");
return -1;
}
- err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx);
+ err = fscrypt_set_test_dummy_encryption(sb, arg->from,
+ &sbi->s_dummy_enc_policy);
if (err) {
if (err == -EEXIST)
ext4_msg(sb, KERN_WARNING,
@@ -3213,15 +3222,34 @@ static void print_daily_error_info(struct timer_list *t)
static int ext4_run_li_request(struct ext4_li_request *elr)
{
struct ext4_group_desc *gdp = NULL;
- ext4_group_t group, ngroups;
- struct super_block *sb;
+ struct super_block *sb = elr->lr_super;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t group = elr->lr_next_group;
unsigned long timeout = 0;
+ unsigned int prefetch_ios = 0;
int ret = 0;
- sb = elr->lr_super;
- ngroups = EXT4_SB(sb)->s_groups_count;
+ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
+ elr->lr_next_group = ext4_mb_prefetch(sb, group,
+ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
+ if (prefetch_ios)
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group,
+ prefetch_ios);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
+ prefetch_ios);
+ if (group >= elr->lr_next_group) {
+ ret = 1;
+ if (elr->lr_first_not_zeroed != ngroups &&
+ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ elr->lr_next_group = elr->lr_first_not_zeroed;
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ ret = 0;
+ }
+ }
+ return ret;
+ }
- for (group = elr->lr_next_group; group < ngroups; group++) {
+ for (; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
ret = 1;
@@ -3239,9 +3267,10 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
timeout = jiffies;
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
+ trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
timeout = (jiffies - timeout) *
- elr->lr_sbi->s_li_wait_mult;
+ EXT4_SB(elr->lr_super)->s_li_wait_mult;
elr->lr_timeout = timeout;
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -3256,15 +3285,11 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
- struct ext4_sb_info *sbi;
-
if (!elr)
return;
- sbi = elr->lr_sbi;
-
list_del(&elr->lr_request);
- sbi->s_li_request = NULL;
+ EXT4_SB(elr->lr_super)->s_li_request = NULL;
kfree(elr);
}
@@ -3473,7 +3498,6 @@ static int ext4_li_info_new(void)
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
ext4_group_t start)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
@@ -3481,8 +3505,13 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return NULL;
elr->lr_super = sb;
- elr->lr_sbi = sbi;
- elr->lr_next_group = start;
+ elr->lr_first_not_zeroed = start;
+ if (test_opt(sb, PREFETCH_BLOCK_BITMAPS))
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ else {
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ elr->lr_next_group = start;
+ }
/*
* Randomize first schedule time of the request to
@@ -3512,8 +3541,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE))
+ if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || sb_rdonly(sb) ||
+ !test_opt(sb, INIT_INODE_TABLE)))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -4710,11 +4740,13 @@ no_journal:
ext4_set_resv_clusters(sb);
- err = ext4_setup_system_zone(sb);
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)", err);
- goto failed_mount4a;
+ if (test_opt(sb, BLOCK_VALIDITY)) {
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)", err);
+ goto failed_mount4a;
+ }
}
ext4_ext_init(sb);
@@ -4777,12 +4809,23 @@ no_journal:
}
#endif /* CONFIG_QUOTA */
+ /*
+ * Save the original bdev mapping's wb_err value which could be
+ * used to detect the metadata async write error.
+ */
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+ if (!sb_rdonly(sb))
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
+ sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
- ext4_mark_recovery_complete(sb, es);
+ err = ext4_mark_recovery_complete(sb, es);
+ if (err)
+ goto failed_mount8;
}
if (EXT4_SB(sb)->s_journal) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -4816,6 +4859,8 @@ no_journal:
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+ atomic_set(&sbi->s_warning_count, 0);
+ atomic_set(&sbi->s_msg_count, 0);
kfree(orig_data);
return 0;
@@ -4825,10 +4870,8 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
-#ifdef CONFIG_QUOTA
failed_mount8:
ext4_unregister_sysfs(sb);
-#endif
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4892,7 +4935,7 @@ failed_mount:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
ext4_blkdev_remove(sbi);
brelse(bh);
out_fail:
@@ -4968,7 +5011,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return NULL;
journal_inode = ext4_get_journal_inode(sb, journal_inum);
if (!journal_inode)
@@ -4998,7 +5042,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return NULL;
bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
@@ -5089,8 +5134,10 @@ static int ext4_load_journal(struct super_block *sb,
dev_t journal_dev;
int err = 0;
int really_read_only;
+ int journal_dev_ro;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
+ return -EFSCORRUPTED;
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -5100,7 +5147,31 @@ static int ext4_load_journal(struct super_block *sb,
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
- really_read_only = bdev_read_only(sb->s_bdev);
+ if (journal_inum && journal_dev) {
+ ext4_msg(sb, KERN_ERR,
+ "filesystem has both journal inode and journal device!");
+ return -EINVAL;
+ }
+
+ if (journal_inum) {
+ journal = ext4_get_journal(sb, journal_inum);
+ if (!journal)
+ return -EINVAL;
+ } else {
+ journal = ext4_get_dev_journal(sb, journal_dev);
+ if (!journal)
+ return -EINVAL;
+ }
+
+ journal_dev_ro = bdev_read_only(journal->j_dev);
+ really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
+
+ if (journal_dev_ro && !sb_rdonly(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "journal device read-only, try mounting with '-o ro'");
+ err = -EROFS;
+ goto err_out;
+ }
/*
* Are we loading a blank journal or performing recovery after a
@@ -5115,27 +5186,14 @@ static int ext4_load_journal(struct super_block *sb,
ext4_msg(sb, KERN_ERR, "write access "
"unavailable, cannot proceed "
"(try mounting with noload)");
- return -EROFS;
+ err = -EROFS;
+ goto err_out;
}
ext4_msg(sb, KERN_INFO, "write access will "
"be enabled during recovery");
}
}
- if (journal_inum && journal_dev) {
- ext4_msg(sb, KERN_ERR, "filesystem has both journal "
- "and inode journals!");
- return -EINVAL;
- }
-
- if (journal_inum) {
- if (!(journal = ext4_get_journal(sb, journal_inum)))
- return -EINVAL;
- } else {
- if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
- return -EINVAL;
- }
-
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
@@ -5155,12 +5213,16 @@ static int ext4_load_journal(struct super_block *sb,
if (err) {
ext4_msg(sb, KERN_ERR, "error loading journal");
- jbd2_journal_destroy(journal);
- return err;
+ goto err_out;
}
EXT4_SB(sb)->s_journal = journal;
- ext4_clear_journal_err(sb, es);
+ err = ext4_clear_journal_err(sb, es);
+ if (err) {
+ EXT4_SB(sb)->s_journal = NULL;
+ jbd2_journal_destroy(journal);
+ return err;
+ }
if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -5171,6 +5233,10 @@ static int ext4_load_journal(struct super_block *sb,
}
return 0;
+
+err_out:
+ jbd2_journal_destroy(journal);
+ return err;
}
static int ext4_commit_super(struct super_block *sb, int sync)
@@ -5183,13 +5249,6 @@ static int ext4_commit_super(struct super_block *sb, int sync)
return error;
/*
- * The superblock bh should be mapped, but it might not be if the
- * device was hot-removed. Not much we can do but fail the I/O.
- */
- if (!buffer_mapped(sbh))
- return error;
-
- /*
* If the file system is mounted read-only, don't update the
* superblock write time. This avoids updating the superblock
* write time when we are mounting the root file system
@@ -5256,26 +5315,32 @@ static int ext4_commit_super(struct super_block *sb, int sync)
* remounting) the filesystem readonly, then we will end up with a
* consistent fs on disk. Record that fact.
*/
-static void ext4_mark_recovery_complete(struct super_block *sb,
- struct ext4_super_block *es)
+static int ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es)
{
+ int err;
journal_t *journal = EXT4_SB(sb)->s_journal;
if (!ext4_has_feature_journal(sb)) {
- BUG_ON(journal != NULL);
- return;
+ if (journal != NULL) {
+ ext4_error(sb, "Journal got removed while the fs was "
+ "mounted!");
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
jbd2_journal_lock_updates(journal);
- if (jbd2_journal_flush(journal) < 0)
+ err = jbd2_journal_flush(journal);
+ if (err < 0)
goto out;
if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
ext4_clear_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
}
-
out:
jbd2_journal_unlock_updates(journal);
+ return err;
}
/*
@@ -5283,14 +5348,17 @@ out:
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext4_clear_journal_err(struct super_block *sb,
+static int ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es)
{
journal_t *journal;
int j_errno;
const char *errstr;
- BUG_ON(!ext4_has_feature_journal(sb));
+ if (!ext4_has_feature_journal(sb)) {
+ ext4_error(sb, "Journal got removed while the fs was mounted!");
+ return -EFSCORRUPTED;
+ }
journal = EXT4_SB(sb)->s_journal;
@@ -5315,6 +5383,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
jbd2_journal_clear_err(journal);
jbd2_journal_update_sb_errno(journal);
}
+ return 0;
}
/*
@@ -5457,7 +5526,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned long old_sb_flags;
+ unsigned long old_sb_flags, vfs_flags;
struct ext4_mount_options old_opts;
int enable_quota = 0;
ext4_group_t g;
@@ -5500,6 +5569,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ /*
+ * Some options can be enabled by ext4 and/or by VFS mount flag
+ * either way we need to make sure it matches in both *flags and
+ * s_flags. Copy those selected flags from *flags to s_flags
+ */
+ vfs_flags = SB_LAZYTIME | SB_I_VERSION;
+ sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
+
if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
@@ -5553,9 +5630,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
- if (*flags & SB_LAZYTIME)
- sb->s_flags |= SB_LAZYTIME;
-
if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
@@ -5585,8 +5659,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
(sbi->s_mount_state & EXT4_VALID_FS))
es->s_state = cpu_to_le16(sbi->s_mount_state);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
+ /*
+ * We let remount-ro finish even if marking fs
+ * as clean failed...
+ */
ext4_mark_recovery_complete(sb, es);
+ }
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
} else {
@@ -5629,13 +5708,24 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
/*
+ * Update the original bdev mapping's wb_err value
+ * which could be used to detect the metadata async
+ * write error.
+ */
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
+
+ /*
* Mounting a RDONLY partition read-write, so reread
* and store the current valid flag. (It may have
* been changed by e2fsck since we originally mounted
* the partition.)
*/
- if (sbi->s_journal)
- ext4_clear_journal_err(sb, es);
+ if (sbi->s_journal) {
+ err = ext4_clear_journal_err(sb, es);
+ if (err)
+ goto restore_opts;
+ }
sbi->s_mount_state = le16_to_cpu(es->s_state);
err = ext4_setup_super(sb, es, 0);
@@ -5665,7 +5755,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_register_li_request(sb, first_not_zeroed);
}
- ext4_setup_system_zone(sb);
+ /*
+ * Handle creation of system zone data early because it can fail.
+ * Releasing of existing data is done when we are sure remount will
+ * succeed.
+ */
+ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) {
+ err = ext4_setup_system_zone(sb);
+ if (err)
+ goto restore_opts;
+ }
+
if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
err = ext4_commit_super(sb, 1);
if (err)
@@ -5686,8 +5786,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ ext4_release_system_zone(sb);
+
+ /*
+ * Some options can be enabled by ext4 and/or by VFS mount flag
+ * either way we need to make sure it matches in both *flags and
+ * s_flags. Copy those selected flags from s_flags to *flags
+ */
+ *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
return 0;
@@ -5701,6 +5809,8 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6c9fc9e21c13..bfabb799fa45 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -189,6 +189,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -215,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -226,6 +230,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
+EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count);
+EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
@@ -240,6 +246,8 @@ EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -257,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(mb_max_inode_prealloc),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
@@ -267,6 +276,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(errors_count),
+ ATTR_LIST(warning_count),
+ ATTR_LIST(msg_count),
ATTR_LIST(first_error_ino),
ATTR_LIST(last_error_ino),
ATTR_LIST(first_error_block),
@@ -283,6 +294,8 @@ static struct attribute *ext4_attrs[] = {
#ifdef CONFIG_EXT4_DEBUG
ATTR_LIST(simulate_fail),
#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7d2f6576d954..cba4b877c606 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1356,8 +1356,7 @@ retry:
block = 0;
while (wsize < bufsize) {
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
csize = (bufsize - wsize) > blocksize ? blocksize :
bufsize - wsize;
bh = ext4_getblk(handle, ea_inode, block, 0);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 236064930251..ff807e14c891 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -523,7 +523,7 @@ void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
__remove_ino_entry(sbi, ino, type);
}
-/* mode should be APPEND_INO or UPDATE_INO */
+/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */
bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
struct inode_management *im = &sbi->im[mode];
@@ -1258,8 +1258,6 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
DEFINE_WAIT(wait);
for (;;) {
- prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
-
if (!get_pages(sbi, type))
break;
@@ -1269,6 +1267,10 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
if (type == F2FS_DIRTY_META)
f2fs_sync_meta_pages(sbi, META, LONG_MAX,
FS_CP_META_IO);
+ else if (type == F2FS_WB_CP_DATA)
+ f2fs_submit_merged_write(sbi, DATA);
+
+ prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
@@ -1415,7 +1417,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
- /* 2 cp + n data seg summary + orphan inode blocks */
+ /* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
spin_lock_irqsave(&sbi->cp_lock, flags);
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
@@ -1515,9 +1517,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/*
* invalidate intermediate page cache borrowed from meta inode which are
- * used for migration of encrypted or verity inode's blocks.
+ * used for migration of encrypted, verity or compressed inode's blocks.
*/
- if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi))
+ if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
+ f2fs_sb_has_compression(sbi))
invalidate_mapping_pages(META_MAPPING(sbi),
MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 29e50fbe7eca..1dfb126a0cb2 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -49,6 +49,13 @@ bool f2fs_is_compressed_page(struct page *page)
return false;
if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
return false;
+ /*
+ * page->private may be set with pid.
+ * pid_max is enough to check if it is traced.
+ */
+ if (IS_IO_TRACED_PAGE(page))
+ return false;
+
f2fs_bug_on(F2FS_M_SB(page->mapping),
*((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC);
return true;
@@ -506,7 +513,7 @@ bool f2fs_is_compress_backend_ready(struct inode *inode)
return f2fs_cops[F2FS_I(inode)->i_compress_algorithm];
}
-static mempool_t *compress_page_pool = NULL;
+static mempool_t *compress_page_pool;
static int num_compress_pages = 512;
module_param(num_compress_pages, uint, 0444);
MODULE_PARM_DESC(num_compress_pages,
@@ -663,6 +670,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
int ret;
+ int i;
dec_page_count(sbi, F2FS_RD_DATA);
@@ -681,6 +689,26 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
goto out_free_dic;
}
+ dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+ dic->cluster_size, GFP_NOFS);
+ if (!dic->tpages) {
+ ret = -ENOMEM;
+ goto out_free_dic;
+ }
+
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->rpages[i]) {
+ dic->tpages[i] = dic->rpages[i];
+ continue;
+ }
+
+ dic->tpages[i] = f2fs_compress_alloc_page();
+ if (!dic->tpages[i]) {
+ ret = -ENOMEM;
+ goto out_free_dic;
+ }
+ }
+
if (cops->init_decompress_ctx) {
ret = cops->init_decompress_ctx(dic);
if (ret)
@@ -821,7 +849,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
}
/* return # of valid blocks in compressed cluster */
-static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
+static int f2fs_cluster_blocks(struct compress_ctx *cc)
{
return __f2fs_cluster_blocks(cc, false);
}
@@ -835,7 +863,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
};
- return f2fs_cluster_blocks(&cc, false);
+ return f2fs_cluster_blocks(&cc);
}
static bool cluster_may_compress(struct compress_ctx *cc)
@@ -886,7 +914,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
bool prealloc;
retry:
- ret = f2fs_cluster_blocks(cc, false);
+ ret = f2fs_cluster_blocks(cc);
if (ret <= 0)
return ret;
@@ -949,7 +977,7 @@ retry:
}
if (prealloc) {
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
@@ -964,7 +992,7 @@ retry:
break;
}
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
}
if (likely(!ret)) {
@@ -1096,8 +1124,16 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
- if (!IS_NOQUOTA(inode) && !f2fs_trylock_op(sbi))
+ if (IS_NOQUOTA(inode)) {
+ /*
+ * We need to wait for node_write to avoid block allocation during
+ * checkpoint. This can only happen to quota writes which can cause
+ * the below discard race condition.
+ */
+ down_read(&sbi->node_write);
+ } else if (!f2fs_trylock_op(sbi)) {
return -EAGAIN;
+ }
set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
@@ -1137,6 +1173,13 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
f2fs_set_compressed_page(cc->cpages[i], inode,
cc->rpages[i + 1]->index, cic);
fio.compressed_page = cc->cpages[i];
+
+ fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page,
+ dn.ofs_in_node + i + 1);
+
+ /* wait for GCed page writeback via META_MAPPING */
+ f2fs_wait_on_block_writeback(inode, fio.old_blkaddr);
+
if (fio.encrypted) {
fio.page = cc->rpages[i + 1];
err = f2fs_encrypt_one_page(&fio);
@@ -1203,7 +1246,9 @@ unlock_continue:
set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
f2fs_put_dnode(&dn);
- if (!IS_NOQUOTA(inode))
+ if (IS_NOQUOTA(inode))
+ up_read(&sbi->node_write);
+ else
f2fs_unlock_op(sbi);
spin_lock(&fi->i_size_lock);
@@ -1230,7 +1275,9 @@ out_put_cic:
out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
- if (!IS_NOQUOTA(inode))
+ if (IS_NOQUOTA(inode))
+ up_read(&sbi->node_write);
+ else
f2fs_unlock_op(sbi);
return -EAGAIN;
}
@@ -1310,6 +1357,12 @@ retry_write:
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
lock_page(cc->rpages[i]);
+
+ if (!PageDirty(cc->rpages[i])) {
+ unlock_page(cc->rpages[i]);
+ continue;
+ }
+
clear_page_dirty_for_io(cc->rpages[i]);
goto retry_write;
}
@@ -1353,6 +1406,8 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
err = f2fs_write_compressed_pages(cc, submitted,
wbc, io_type);
cops->destroy_compress_ctx(cc);
+ kfree(cc->cpages);
+ cc->cpages = NULL;
if (!err)
return 0;
f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN);
@@ -1415,22 +1470,6 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->cpages[i] = page;
}
- dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
- dic->cluster_size, GFP_NOFS);
- if (!dic->tpages)
- goto out_free;
-
- for (i = 0; i < dic->cluster_size; i++) {
- if (cc->rpages[i]) {
- dic->tpages[i] = cc->rpages[i];
- continue;
- }
-
- dic->tpages[i] = f2fs_compress_alloc_page();
- if (!dic->tpages[i])
- goto out_free;
- }
-
return dic;
out_free:
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5f527073143e..73683e58a08d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -87,7 +87,7 @@ static bool __is_cp_guaranteed(struct page *page)
sbi = F2FS_I_SB(inode);
if (inode->i_ino == F2FS_META_INO(sbi) ||
- inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_NODE_INO(sbi) ||
S_ISDIR(inode->i_mode) ||
(S_ISREG(inode->i_mode) &&
(f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
@@ -1073,12 +1073,13 @@ static void f2fs_release_read_bio(struct bio *bio)
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
- block_t blkaddr, bool for_write)
+ block_t blkaddr, int op_flags, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write);
+ bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
+ page->index, for_write);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1193,7 +1194,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
{
- struct extent_info ei = {0,0,0};
+ struct extent_info ei = {0, 0, 0};
struct inode *inode = dn->inode;
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
@@ -1265,7 +1266,8 @@ got_it:
return page;
}
- err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write);
+ err = f2fs_submit_page_read(inode, page, dn.data_blkaddr,
+ op_flags, for_write);
if (err)
goto put_err;
return page;
@@ -1414,7 +1416,7 @@ alloc:
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
old_blkaddr = dn->data_blkaddr;
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
- &sum, seg_type, NULL, false);
+ &sum, seg_type, NULL);
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
invalidate_mapping_pages(META_MAPPING(sbi),
old_blkaddr, old_blkaddr);
@@ -1474,7 +1476,7 @@ map_blocks:
return err;
}
-void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
@@ -1539,7 +1541,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
next_dnode:
if (map->m_may_create)
- __do_map_lock(sbi, flag, true);
+ f2fs_do_map_lock(sbi, flag, true);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -1688,7 +1690,7 @@ skip:
f2fs_put_dnode(&dn);
if (map->m_may_create) {
- __do_map_lock(sbi, flag, false);
+ f2fs_do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
goto next_dnode;
@@ -1714,7 +1716,7 @@ sync_out:
f2fs_put_dnode(&dn);
unlock_out:
if (map->m_may_create) {
- __do_map_lock(sbi, flag, false);
+ f2fs_do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
@@ -1861,6 +1863,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
flags |= FIEMAP_EXTENT_LAST;
err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
+ trace_f2fs_fiemap(inode, 0, phys, len, flags, err);
if (err || err == 1)
return err;
}
@@ -1884,8 +1887,10 @@ static int f2fs_xattr_fiemap(struct inode *inode,
flags = FIEMAP_EXTENT_LAST;
}
- if (phys)
+ if (phys) {
err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
+ trace_f2fs_fiemap(inode, 0, phys, len, flags, err);
+ }
return (err < 0 ? err : 0);
}
@@ -1979,6 +1984,7 @@ next:
ret = fiemap_fill_next_extent(fieinfo, logical,
phys, size, flags);
+ trace_f2fs_fiemap(inode, logical, phys, size, flags, ret);
if (ret)
goto out;
size = 0;
@@ -2213,9 +2219,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (ret)
goto out;
- /* cluster was overwritten as normal cluster */
- if (dn.data_blkaddr != COMPRESS_ADDR)
- goto out;
+ f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
@@ -2342,6 +2346,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ bool drop_ra = false;
map.m_pblk = 0;
map.m_lblk = 0;
@@ -2352,10 +2357,26 @@ static int f2fs_mpage_readpages(struct inode *inode,
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
+ /*
+ * Two readahead threads for same address range can cause race condition
+ * which fragments sequential read IOs. So let's avoid each other.
+ */
+ if (rac && readahead_count(rac)) {
+ if (READ_ONCE(F2FS_I(inode)->ra_offset) == readahead_index(rac))
+ drop_ra = true;
+ else
+ WRITE_ONCE(F2FS_I(inode)->ra_offset,
+ readahead_index(rac));
+ }
+
for (; nr_pages; nr_pages--) {
if (rac) {
page = readahead_page(rac);
prefetchw(&page->flags);
+ if (drop_ra) {
+ f2fs_put_page(page, 1);
+ continue;
+ }
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2418,6 +2439,9 @@ next_page:
}
if (bio)
__submit_bio(F2FS_I_SB(inode), bio, DATA);
+
+ if (rac && readahead_count(rac) && !drop_ra)
+ WRITE_ONCE(F2FS_I(inode)->ra_offset, -1);
return ret;
}
@@ -2772,8 +2796,20 @@ write:
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) {
+ /*
+ * We need to wait for node_write to avoid block allocation during
+ * checkpoint. This can only happen to quota writes which can cause
+ * the below discard race condition.
+ */
+ if (IS_NOQUOTA(inode))
+ down_read(&sbi->node_write);
+
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
+
+ if (IS_NOQUOTA(inode))
+ up_read(&sbi->node_write);
+
goto done;
}
@@ -3268,7 +3304,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
if (f2fs_has_inline_data(inode) ||
(pos & PAGE_MASK) >= i_size_read(inode)) {
- __do_map_lock(sbi, flag, true);
+ f2fs_do_map_lock(sbi, flag, true);
locked = true;
}
@@ -3305,7 +3341,7 @@ restart:
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err || dn.data_blkaddr == NULL_ADDR) {
f2fs_put_dnode(&dn);
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
true);
WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
locked = true;
@@ -3321,7 +3357,7 @@ out:
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- __do_map_lock(sbi, flag, false);
+ f2fs_do_map_lock(sbi, flag, false);
return err;
}
@@ -3433,7 +3469,7 @@ repeat:
err = -EFSCORRUPTED;
goto fail;
}
- err = f2fs_submit_page_read(inode, page, blkaddr, true);
+ err = f2fs_submit_page_read(inode, page, blkaddr, 0, true);
if (err)
goto fail;
@@ -3483,6 +3519,10 @@ static int f2fs_write_end(struct file *file,
if (f2fs_compressed_file(inode) && fsdata) {
f2fs_compress_write_end(inode, fsdata, page->index, copied);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+ if (pos + copied > i_size_read(inode) &&
+ !f2fs_verity_in_progress(inode))
+ f2fs_i_size_write(inode, pos + copied);
return copied;
}
#endif
@@ -3510,6 +3550,9 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
unsigned long align = offset | iov_iter_alignment(iter);
struct block_device *bdev = inode->i_sb->s_bdev;
+ if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode))
+ return 1;
+
if (align & blocksize_mask) {
if (bdev)
blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -3742,10 +3785,9 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block)
}
f2fs_put_dnode(&dn);
-
return blknr;
#else
- return -EOPNOTSUPP;
+ return 0;
#endif
}
@@ -3753,18 +3795,26 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block)
static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
+ struct buffer_head tmp = {
+ .b_size = i_blocksize(inode),
+ };
+ sector_t blknr = 0;
if (f2fs_has_inline_data(inode))
- return 0;
+ goto out;
/* make sure allocating whole blocks */
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
filemap_write_and_wait(mapping);
if (f2fs_compressed_file(inode))
- return f2fs_bmap_compress(inode, block);
+ blknr = f2fs_bmap_compress(inode, block);
- return generic_block_bmap(mapping, block, get_data_block_bmap);
+ if (!get_data_block_bmap(inode, block, &tmp, 0))
+ blknr = tmp.b_blocknr;
+out:
+ trace_f2fs_bmap(inode, block, blknr);
+ return blknr;
}
#ifdef CONFIG_MIGRATION
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 0dbcb0f9c019..4276c0f79beb 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -174,6 +174,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = META_CP; i < META_MAX; i++)
si->meta_count[i] = atomic_read(&sbi->meta_count[i]);
+ for (i = 0; i < NO_CHECK_TYPE; i++) {
+ si->dirty_seg[i] = 0;
+ si->full_seg[i] = 0;
+ si->valid_blks[i] = 0;
+ }
+
+ for (i = 0; i < MAIN_SEGS(sbi); i++) {
+ int blks = get_seg_entry(sbi, i)->valid_blocks;
+ int type = get_seg_entry(sbi, i)->type;
+
+ if (!blks)
+ continue;
+
+ if (blks == sbi->blocks_per_seg)
+ si->full_seg[type]++;
+ else
+ si->dirty_seg[type]++;
+ si->valid_blks[type] += blks;
+ }
+
for (i = 0; i < 2; i++) {
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
@@ -329,30 +349,50 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
- seq_printf(s, " - COLD data: %d, %d, %d\n",
+ seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n",
+ "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk");
+ seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_COLD_DATA],
si->cursec[CURSEG_COLD_DATA],
- si->curzone[CURSEG_COLD_DATA]);
- seq_printf(s, " - WARM data: %d, %d, %d\n",
+ si->curzone[CURSEG_COLD_DATA],
+ si->dirty_seg[CURSEG_COLD_DATA],
+ si->full_seg[CURSEG_COLD_DATA],
+ si->valid_blks[CURSEG_COLD_DATA]);
+ seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_WARM_DATA],
si->cursec[CURSEG_WARM_DATA],
- si->curzone[CURSEG_WARM_DATA]);
- seq_printf(s, " - HOT data: %d, %d, %d\n",
+ si->curzone[CURSEG_WARM_DATA],
+ si->dirty_seg[CURSEG_WARM_DATA],
+ si->full_seg[CURSEG_WARM_DATA],
+ si->valid_blks[CURSEG_WARM_DATA]);
+ seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_HOT_DATA],
si->cursec[CURSEG_HOT_DATA],
- si->curzone[CURSEG_HOT_DATA]);
- seq_printf(s, " - Dir dnode: %d, %d, %d\n",
+ si->curzone[CURSEG_HOT_DATA],
+ si->dirty_seg[CURSEG_HOT_DATA],
+ si->full_seg[CURSEG_HOT_DATA],
+ si->valid_blks[CURSEG_HOT_DATA]);
+ seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_HOT_NODE],
si->cursec[CURSEG_HOT_NODE],
- si->curzone[CURSEG_HOT_NODE]);
- seq_printf(s, " - File dnode: %d, %d, %d\n",
+ si->curzone[CURSEG_HOT_NODE],
+ si->dirty_seg[CURSEG_HOT_NODE],
+ si->full_seg[CURSEG_HOT_NODE],
+ si->valid_blks[CURSEG_HOT_NODE]);
+ seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_WARM_NODE],
si->cursec[CURSEG_WARM_NODE],
- si->curzone[CURSEG_WARM_NODE]);
- seq_printf(s, " - Indir nodes: %d, %d, %d\n",
+ si->curzone[CURSEG_WARM_NODE],
+ si->dirty_seg[CURSEG_WARM_NODE],
+ si->full_seg[CURSEG_WARM_NODE],
+ si->valid_blks[CURSEG_WARM_NODE]);
+ seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_COLD_NODE],
si->cursec[CURSEG_COLD_NODE],
- si->curzone[CURSEG_COLD_NODE]);
+ si->curzone[CURSEG_COLD_NODE],
+ si->dirty_seg[CURSEG_COLD_NODE],
+ si->full_seg[CURSEG_COLD_NODE],
+ si->valid_blks[CURSEG_COLD_NODE]);
seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
si->main_area_segs - si->dirty_count -
si->prefree_count - si->free_segs,
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index d35976785e8c..53fbc4dd6e48 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -111,7 +111,7 @@ static int __f2fs_setup_filename(const struct inode *dir,
#ifdef CONFIG_FS_ENCRYPTION
fname->crypto_buf = crypt_name->crypto_buf;
#endif
- if (crypt_name->is_ciphertext_name) {
+ if (crypt_name->is_nokey_name) {
/* hash was decoded from the no-key name */
fname->hash = cpu_to_le32(crypt_name->hash);
} else {
@@ -537,7 +537,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
goto put_error;
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_inherit_context(dir, inode, page, false);
+ err = fscrypt_set_context(inode, page);
if (err)
goto put_error;
}
@@ -779,7 +779,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
return err;
/*
- * An immature stakable filesystem shows a race condition between lookup
+ * An immature stackable filesystem shows a race condition between lookup
* and create. If we have same task when doing lookup and create, it's
* definitely fine as expected by VFS normally. Otherwise, let's just
* verify on-disk dentry one more time, which guarantees filesystem
@@ -1032,7 +1032,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (err)
goto out;
- err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr);
if (err < 0)
goto out;
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index e60078460ad1..686c68b98610 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -325,9 +325,10 @@ static void __drop_largest_extent(struct extent_tree *et,
}
/* return true, if inode page is changed */
-static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL;
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
@@ -335,16 +336,18 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e
if (!f2fs_may_extent_tree(inode)) {
/* drop largest extent */
if (i_ext && i_ext->len) {
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
i_ext->len = 0;
- return true;
+ set_page_dirty(ipage);
+ return;
}
- return false;
+ return;
}
et = __grab_extent_tree(inode);
if (!i_ext || !i_ext->len)
- return false;
+ return;
get_extent_info(&ei, i_ext);
@@ -360,17 +363,14 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e
}
out:
write_unlock(&et->lock);
- return false;
}
-bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+void f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
{
- bool ret = __f2fs_init_extent_tree(inode, i_ext);
+ __f2fs_init_extent_tree(inode, ipage);
if (!F2FS_I(inode)->extent_tree)
set_inode_flag(inode, FI_NO_EXTENT);
-
- return ret;
}
static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index b35a50f4953c..7c089ff7ff94 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -138,7 +138,7 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
- struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */
+ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */
block_t unusable_cap_perc; /* percentage for cap */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
@@ -402,12 +402,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
}
/*
- * ioctl commands
+ * f2fs-specific ioctl commands
*/
-#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
-#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION
-
#define F2FS_IOCTL_MAGIC 0xf5
#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
@@ -434,13 +430,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
_IOR(F2FS_IOCTL_MAGIC, 18, __u64)
#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
_IOR(F2FS_IOCTL_MAGIC, 19, __u64)
-
-#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
-#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
-
-#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
-#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
-#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
+#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \
+ struct f2fs_sectrim_range)
/*
* should be same as XFS_IOC_GOINGDOWN.
@@ -453,17 +444,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
-#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
- * ioctl commands in 32 bit emulation
+ * Flags used by F2FS_IOC_SEC_TRIM_FILE
*/
-#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
-#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
-#endif
-
-#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */
+#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */
+#define F2FS_TRIM_FILE_MASK 0x3
struct f2fs_gc_range {
u32 sync;
@@ -488,6 +474,12 @@ struct f2fs_flush_device {
u32 segments; /* # of segments to flush */
};
+struct f2fs_sectrim_range {
+ u64 start;
+ u64 len;
+ u64 flags;
+};
+
/* for inline stuff */
#define DEF_INLINE_RESERVED_SIZE 1
static inline int get_extra_isize(struct inode *inode);
@@ -794,6 +786,7 @@ struct f2fs_inode_info {
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct task_struct *inmem_task; /* store inmemory task */
struct mutex inmem_lock; /* lock for inmemory pages */
+ pgoff_t ra_offset; /* ongoing readahead offset */
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
@@ -1267,7 +1260,8 @@ enum {
GC_NORMAL,
GC_IDLE_CB,
GC_IDLE_GREEDY,
- GC_URGENT,
+ GC_URGENT_HIGH,
+ GC_URGENT_LOW,
};
enum {
@@ -1313,11 +1307,12 @@ enum fsync_mode {
#define IS_DUMMY_WRITTEN_PAGE(page) \
(page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) \
- (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL))
+#ifdef CONFIG_F2FS_IO_TRACE
+#define IS_IO_TRACED_PAGE(page) \
+ (page_private(page) > 0 && \
+ page_private(page) < (unsigned long)PID_MAX_LIMIT)
#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
+#define IS_IO_TRACED_PAGE(page) (0)
#endif
/* For compression */
@@ -1438,7 +1433,7 @@ struct f2fs_sb_info {
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
- struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
+ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
spinlock_t fsync_node_lock; /* for node entry lock */
struct list_head fsync_node_list; /* node list head */
@@ -1516,8 +1511,9 @@ struct f2fs_sb_info {
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
+
/* for skip statistic */
- unsigned int atomic_files; /* # of opened atomic file */
+ unsigned int atomic_files; /* # of opened atomic file */
unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
@@ -2456,7 +2452,7 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
{
- if (sbi->gc_mode == GC_URGENT)
+ if (sbi->gc_mode == GC_URGENT_HIGH)
return true;
if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
@@ -2474,6 +2470,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
return false;
+ if (sbi->gc_mode == GC_URGENT_LOW &&
+ (type == DISCARD_TIME || type == GC_TIME))
+ return true;
+
return f2fs_time_over(sbi, type);
}
@@ -2639,7 +2639,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_NEW_INODE:
if (set)
return;
- /* fall through */
+ fallthrough;
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
@@ -2649,7 +2649,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
static inline void set_inode_flag(struct inode *inode, int flag)
{
- test_and_set_bit(flag, F2FS_I(inode)->flags);
+ set_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, true);
}
@@ -2660,7 +2660,7 @@ static inline int is_inode_flag_set(struct inode *inode, int flag)
static inline void clear_inode_flag(struct inode *inode, int flag)
{
- test_and_clear_bit(flag, F2FS_I(inode)->flags);
+ clear_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, false);
}
@@ -3275,7 +3275,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
struct page *f2fs_get_node_page_ra(struct page *parent, int start);
int f2fs_move_node_page(struct page *node_page, int gc_type);
-int f2fs_flush_inline_data(struct f2fs_sb_info *sbi);
+void f2fs_flush_inline_data(struct f2fs_sb_info *sbi);
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
unsigned int *seq_id);
@@ -3287,7 +3287,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink);
-void f2fs_recover_inline_xattr(struct inode *inode, struct page *page);
+int f2fs_recover_inline_xattr(struct inode *inode, struct page *page);
int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
@@ -3325,9 +3325,10 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi);
int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
-void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
+void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type);
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
@@ -3350,7 +3351,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
- struct f2fs_io_info *fio, bool add_list);
+ struct f2fs_io_info *fio);
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered, bool locked);
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
@@ -3448,7 +3449,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
struct page *f2fs_get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size);
int f2fs_do_write_data_page(struct f2fs_io_info *fio);
-void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
+void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3536,6 +3537,9 @@ struct f2fs_stat_info {
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
+ unsigned int dirty_seg[NR_CURSEG_TYPE];
+ unsigned int full_seg[NR_CURSEG_TYPE];
+ unsigned int valid_blks[NR_CURSEG_TYPE];
unsigned int meta_count[META_MAX];
unsigned int segment_count[2];
@@ -3750,7 +3754,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
int f2fs_write_inline_data(struct inode *inode, struct page *page);
-bool f2fs_recover_inline_data(struct inode *inode, struct page *npage);
+int f2fs_recover_inline_data(struct inode *inode, struct page *npage);
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
const struct f2fs_filename *fname,
struct page **res_page);
@@ -3795,7 +3799,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
struct rb_root_cached *root);
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
-bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
+void f2fs_init_extent_tree(struct inode *inode, struct page *ipage);
void f2fs_drop_extent_tree(struct inode *inode);
unsigned int f2fs_destroy_extent_node(struct inode *inode);
void f2fs_destroy_extent_tree(struct inode *inode);
@@ -4011,22 +4015,6 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
-{
-#ifdef CONFIG_FS_ENCRYPTION
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- umode_t mode = inode->i_mode;
-
- /*
- * If the directory encrypted or dummy encryption enabled,
- * then we should encrypt the inode.
- */
- if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
- return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#endif
- return false;
-}
-
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3268f8dd59bb..8a422400e824 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -21,6 +21,7 @@
#include <linux/uuid.h>
#include <linux/file.h>
#include <linux/nls.h>
+#include <linux/sched/signal.h>
#include "f2fs.h"
#include "node.h"
@@ -105,11 +106,11 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (need_alloc) {
/* block allocation */
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_block(&dn, page->index);
f2fs_put_dnode(&dn);
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -1373,8 +1374,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache(inode, offset);
new_size = i_size_read(inode) - len;
- truncate_pagecache(inode, new_size);
-
ret = f2fs_truncate_blocks(inode, new_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (!ret)
@@ -1660,7 +1659,7 @@ next_alloc:
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
f2fs_lock_op(sbi);
- f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
+ f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA);
f2fs_unlock_op(sbi);
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
@@ -2527,6 +2526,11 @@ do_more:
}
ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = -EAGAIN;
+ goto out;
+ }
range.start += BLKS_PER_SEC(sbi);
if (range.start <= end)
goto do_more;
@@ -3359,7 +3363,7 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
return fsverity_ioctl_measure(filp, (void __user *)arg);
}
-static int f2fs_get_volume_name(struct file *filp, unsigned long arg)
+static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -3385,7 +3389,7 @@ static int f2fs_get_volume_name(struct file *filp, unsigned long arg)
return err;
}
-static int f2fs_set_volume_name(struct file *filp, unsigned long arg)
+static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -3531,14 +3535,14 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (ret)
goto out;
- if (!F2FS_I(inode)->i_compr_blocks)
- goto out;
-
F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL;
f2fs_set_inode_flags(inode);
inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, true);
+ if (!F2FS_I(inode)->i_compr_blocks)
+ goto out;
+
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
down_write(&F2FS_I(inode)->i_mmap_sem);
@@ -3756,6 +3760,193 @@ out:
return ret;
}
+static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode,
+ pgoff_t off, block_t block, block_t len, u32 flags)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ sector_t sector = SECTOR_FROM_BLOCK(block);
+ sector_t nr_sects = SECTOR_FROM_BLOCK(len);
+ int ret = 0;
+
+ if (!q)
+ return -ENXIO;
+
+ if (flags & F2FS_TRIM_FILE_DISCARD)
+ ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
+ blk_queue_secure_erase(q) ?
+ BLKDEV_DISCARD_SECURE : 0);
+
+ if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) {
+ if (IS_ENCRYPTED(inode))
+ ret = fscrypt_zeroout_range(inode, off, block, len);
+ else
+ ret = blkdev_issue_zeroout(bdev, sector, nr_sects,
+ GFP_NOFS, 0);
+ }
+
+ return ret;
+}
+
+static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *prev_bdev = NULL;
+ struct f2fs_sectrim_range range;
+ pgoff_t index, pg_end, prev_index = 0;
+ block_t prev_block = 0, len = 0;
+ loff_t end_addr;
+ bool to_end = false;
+ int ret = 0;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&range, (struct f2fs_sectrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ if (range.flags == 0 || (range.flags & ~F2FS_TRIM_FILE_MASK) ||
+ !S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (((range.flags & F2FS_TRIM_FILE_DISCARD) &&
+ !f2fs_hw_support_discard(sbi)) ||
+ ((range.flags & F2FS_TRIM_FILE_ZEROOUT) &&
+ IS_ENCRYPTED(inode) && f2fs_is_multi_device(sbi)))
+ return -EOPNOTSUPP;
+
+ file_start_write(filp);
+ inode_lock(inode);
+
+ if (f2fs_is_atomic_file(inode) || f2fs_compressed_file(inode) ||
+ range.start >= inode->i_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (range.len == 0)
+ goto err;
+
+ if (inode->i_size - range.start > range.len) {
+ end_addr = range.start + range.len;
+ } else {
+ end_addr = range.len == (u64)-1 ?
+ sbi->sb->s_maxbytes : inode->i_size;
+ to_end = true;
+ }
+
+ if (!IS_ALIGNED(range.start, F2FS_BLKSIZE) ||
+ (!to_end && !IS_ALIGNED(end_addr, F2FS_BLKSIZE))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ index = F2FS_BYTES_TO_BLK(range.start);
+ pg_end = DIV_ROUND_UP(end_addr, F2FS_BLKSIZE);
+
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto err;
+
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
+ ret = filemap_write_and_wait_range(mapping, range.start,
+ to_end ? LLONG_MAX : end_addr - 1);
+ if (ret)
+ goto out;
+
+ truncate_inode_pages_range(mapping, range.start,
+ to_end ? -1 : end_addr - 1);
+
+ while (index < pg_end) {
+ struct dnode_of_data dn;
+ pgoff_t end_offset, count;
+ int i;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (ret) {
+ if (ret == -ENOENT) {
+ index = f2fs_get_next_page_offset(&dn, index);
+ continue;
+ }
+ goto out;
+ }
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ count = min(end_offset - dn.ofs_in_node, pg_end - index);
+ for (i = 0; i < count; i++, index++, dn.ofs_in_node++) {
+ struct block_device *cur_bdev;
+ block_t blkaddr = f2fs_data_blkaddr(&dn);
+
+ if (!__is_valid_data_blkaddr(blkaddr))
+ continue;
+
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE)) {
+ ret = -EFSCORRUPTED;
+ f2fs_put_dnode(&dn);
+ goto out;
+ }
+
+ cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
+ if (f2fs_is_multi_device(sbi)) {
+ int di = f2fs_target_device_index(sbi, blkaddr);
+
+ blkaddr -= FDEV(di).start_blk;
+ }
+
+ if (len) {
+ if (prev_bdev == cur_bdev &&
+ index == prev_index + len &&
+ blkaddr == prev_block + len) {
+ len++;
+ } else {
+ ret = f2fs_secure_erase(prev_bdev,
+ inode, prev_index, prev_block,
+ len, range.flags);
+ if (ret) {
+ f2fs_put_dnode(&dn);
+ goto out;
+ }
+
+ len = 0;
+ }
+ }
+
+ if (!len) {
+ prev_bdev = cur_bdev;
+ prev_index = index;
+ prev_block = blkaddr;
+ len = 1;
+ }
+ }
+
+ f2fs_put_dnode(&dn);
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ cond_resched();
+ }
+
+ if (len)
+ ret = f2fs_secure_erase(prev_bdev, inode, prev_index,
+ prev_block, len, range.flags);
+out:
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+err:
+ inode_unlock(inode);
+ file_end_write(filp);
+
+ return ret;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3764,11 +3955,11 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -ENOSPC;
switch (cmd) {
- case F2FS_IOC_GETFLAGS:
+ case FS_IOC_GETFLAGS:
return f2fs_ioc_getflags(filp, arg);
- case F2FS_IOC_SETFLAGS:
+ case FS_IOC_SETFLAGS:
return f2fs_ioc_setflags(filp, arg);
- case F2FS_IOC_GETVERSION:
+ case FS_IOC_GETVERSION:
return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
return f2fs_ioc_start_atomic_write(filp);
@@ -3784,11 +3975,11 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_shutdown(filp, arg);
case FITRIM:
return f2fs_ioc_fitrim(filp, arg);
- case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
return f2fs_ioc_set_encryption_policy(filp, arg);
- case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
return f2fs_ioc_get_encryption_policy(filp, arg);
- case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
return f2fs_ioc_get_encryption_pwsalt(filp, arg);
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
return f2fs_ioc_get_encryption_policy_ex(filp, arg);
@@ -3816,9 +4007,9 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_flush_device(filp, arg);
case F2FS_IOC_GET_FEATURES:
return f2fs_ioc_get_features(filp, arg);
- case F2FS_IOC_FSGETXATTR:
+ case FS_IOC_FSGETXATTR:
return f2fs_ioc_fsgetxattr(filp, arg);
- case F2FS_IOC_FSSETXATTR:
+ case FS_IOC_FSSETXATTR:
return f2fs_ioc_fssetxattr(filp, arg);
case F2FS_IOC_GET_PIN_FILE:
return f2fs_ioc_get_pin_file(filp, arg);
@@ -3832,16 +4023,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_enable_verity(filp, arg);
case FS_IOC_MEASURE_VERITY:
return f2fs_ioc_measure_verity(filp, arg);
- case F2FS_IOC_GET_VOLUME_NAME:
- return f2fs_get_volume_name(filp, arg);
- case F2FS_IOC_SET_VOLUME_NAME:
- return f2fs_set_volume_name(filp, arg);
+ case FS_IOC_GETFSLABEL:
+ return f2fs_ioc_getfslabel(filp, arg);
+ case FS_IOC_SETFSLABEL:
+ return f2fs_ioc_setfslabel(filp, arg);
case F2FS_IOC_GET_COMPRESS_BLOCKS:
return f2fs_get_compress_blocks(filp, arg);
case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
return f2fs_release_compress_blocks(filp, arg);
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
return f2fs_reserve_compress_blocks(filp, arg);
+ case F2FS_IOC_SEC_TRIM_FILE:
+ return f2fs_sec_trim_file(filp, arg);
default:
return -ENOTTY;
}
@@ -3966,14 +4159,14 @@ out:
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case F2FS_IOC32_GETFLAGS:
- cmd = F2FS_IOC_GETFLAGS;
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
break;
- case F2FS_IOC32_SETFLAGS:
- cmd = F2FS_IOC_SETFLAGS;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
break;
- case F2FS_IOC32_GETVERSION:
- cmd = F2FS_IOC_GETVERSION;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
break;
case F2FS_IOC_START_ATOMIC_WRITE:
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
@@ -3982,9 +4175,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_ABORT_VOLATILE_WRITE:
case F2FS_IOC_SHUTDOWN:
case FITRIM:
- case F2FS_IOC_SET_ENCRYPTION_POLICY:
- case F2FS_IOC_GET_ENCRYPTION_PWSALT:
- case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
case FS_IOC_ADD_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY:
@@ -3998,19 +4191,20 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_MOVE_RANGE:
case F2FS_IOC_FLUSH_DEVICE:
case F2FS_IOC_GET_FEATURES:
- case F2FS_IOC_FSGETXATTR:
- case F2FS_IOC_FSSETXATTR:
+ case FS_IOC_FSGETXATTR:
+ case FS_IOC_FSSETXATTR:
case F2FS_IOC_GET_PIN_FILE:
case F2FS_IOC_SET_PIN_FILE:
case F2FS_IOC_PRECACHE_EXTENTS:
case F2FS_IOC_RESIZE_FS:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
- case F2FS_IOC_GET_VOLUME_NAME:
- case F2FS_IOC_SET_VOLUME_NAME:
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
case F2FS_IOC_GET_COMPRESS_BLOCKS:
case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
+ case F2FS_IOC_SEC_TRIM_FILE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5b95d5a146eb..11b4adde9baf 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -21,6 +21,9 @@
#include "gc.h"
#include <trace/events/f2fs.h>
+static unsigned int count_bits(const unsigned long *addr,
+ unsigned int offset, unsigned int len);
+
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
@@ -79,7 +82,7 @@ static int gc_thread_func(void *data)
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
- if (sbi->gc_mode == GC_URGENT) {
+ if (sbi->gc_mode == GC_URGENT_HIGH) {
wait_ms = gc_th->urgent_sleep_time;
down_write(&sbi->gc_lock);
goto do_gc;
@@ -173,7 +176,7 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
gc_mode = GC_CB;
break;
case GC_IDLE_GREEDY:
- case GC_URGENT:
+ case GC_URGENT_HIGH:
gc_mode = GC_GREEDY;
break;
}
@@ -187,14 +190,20 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
if (p->alloc_mode == SSR) {
p->gc_mode = GC_GREEDY;
- p->dirty_segmap = dirty_i->dirty_segmap[type];
+ p->dirty_bitmap = dirty_i->dirty_segmap[type];
p->max_search = dirty_i->nr_dirty[type];
p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(sbi, gc_type);
- p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
- p->max_search = dirty_i->nr_dirty[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
+ if (__is_large_section(sbi)) {
+ p->dirty_bitmap = dirty_i->dirty_secmap;
+ p->max_search = count_bits(p->dirty_bitmap,
+ 0, MAIN_SECS(sbi));
+ } else {
+ p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY];
+ p->max_search = dirty_i->nr_dirty[DIRTY];
+ }
}
/*
@@ -202,7 +211,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
* foreground GC and urgent GC cases.
*/
if (gc_type != FG_GC &&
- (sbi->gc_mode != GC_URGENT) &&
+ (sbi->gc_mode != GC_URGENT_HIGH) &&
p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
@@ -321,6 +330,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int secno, last_victim;
unsigned int last_segment;
unsigned int nsearched = 0;
+ int ret = 0;
mutex_lock(&dirty_i->seglist_lock);
last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
@@ -332,12 +342,19 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = get_max_cost(sbi, &p);
if (*result != NULL_SEGNO) {
- if (get_valid_blocks(sbi, *result, false) &&
- !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+ if (!get_valid_blocks(sbi, *result, false)) {
+ ret = -ENODATA;
+ goto out;
+ }
+
+ if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+ ret = -EBUSY;
+ else
p.min_segno = *result;
goto out;
}
+ ret = -ENODATA;
if (p.max_search == 0)
goto out;
@@ -365,10 +382,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
while (1) {
- unsigned long cost;
- unsigned int segno;
-
- segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
+ unsigned long cost, *dirty_bitmap;
+ unsigned int unit_no, segno;
+
+ dirty_bitmap = p.dirty_bitmap;
+ unit_no = find_next_bit(dirty_bitmap,
+ last_segment / p.ofs_unit,
+ p.offset / p.ofs_unit);
+ segno = unit_no * p.ofs_unit;
if (segno >= last_segment) {
if (sm->last_victim[p.gc_mode]) {
last_segment =
@@ -381,14 +402,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
p.offset = segno + p.ofs_unit;
- if (p.ofs_unit > 1) {
- p.offset -= segno % p.ofs_unit;
- nsearched += count_bits(p.dirty_segmap,
- p.offset - p.ofs_unit,
- p.ofs_unit);
- } else {
- nsearched++;
- }
+ nsearched++;
#ifdef CONFIG_F2FS_CHECK_FS
/*
@@ -421,9 +435,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
next:
if (nsearched >= p.max_search) {
if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
- sm->last_victim[p.gc_mode] = last_victim + 1;
+ sm->last_victim[p.gc_mode] =
+ last_victim + p.ofs_unit;
else
- sm->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
sm->last_victim[p.gc_mode] %=
(MAIN_SECS(sbi) * sbi->segs_per_sec);
break;
@@ -440,6 +455,7 @@ got_result:
else
set_bit(secno, dirty_i->victim_secmap);
}
+ ret = 0;
}
out:
@@ -449,7 +465,7 @@ out:
prefree_segments(sbi), free_segments(sbi));
mutex_unlock(&dirty_i->seglist_lock);
- return (p.min_segno == NULL_SEGNO) ? 0 : 1;
+ return ret;
}
static const struct victim_selection default_v_ops = {
@@ -833,8 +849,10 @@ static int move_data_block(struct inode *inode, block_t bidx,
mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
fio.old_blkaddr, false);
- if (!mpage)
+ if (!mpage) {
+ err = -ENOMEM;
goto up_out;
+ }
fio.encrypted_page = mpage;
@@ -859,7 +877,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
}
f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
- &sum, CURSEG_COLD_DATA, NULL, false);
+ &sum, CURSEG_COLD_DATA, NULL);
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -1333,10 +1351,9 @@ gc_more:
ret = -EINVAL;
goto stop;
}
- if (!__get_victim(sbi, &segno, gc_type)) {
- ret = -ENODATA;
+ ret = __get_victim(sbi, &segno, gc_type);
+ if (ret)
goto stop;
- }
seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
@@ -1434,7 +1451,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
/* Move out cursegs from the target range */
for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++)
- allocate_segment_for_resize(sbi, type, start, end);
+ f2fs_allocate_segment_for_resize(sbi, type, start, end);
/* do GC to move out valid blocks in the range */
for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index dbade310dc79..102df444f623 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -12,6 +12,7 @@
#include "f2fs.h"
#include "node.h"
+#include <trace/events/f2fs.h>
bool f2fs_may_inline_data(struct inode *inode)
{
@@ -253,7 +254,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
return 0;
}
-bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
+int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
@@ -275,7 +276,8 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
ipage = f2fs_get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(sbi, IS_ERR(ipage));
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
@@ -288,21 +290,25 @@ process_inline:
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
- return true;
+ return 1;
}
if (f2fs_has_inline_data(inode)) {
ipage = f2fs_get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(sbi, IS_ERR(ipage));
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
f2fs_truncate_inline_inode(inode, ipage, 0);
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- if (f2fs_truncate_blocks(inode, 0, false))
- return false;
+ int ret;
+
+ ret = f2fs_truncate_blocks(inode, 0, false);
+ if (ret)
+ return ret;
goto process_inline;
}
- return false;
+ return 0;
}
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
@@ -776,6 +782,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
byteaddr += (char *)inline_data_addr(inode, ipage) -
(char *)F2FS_INODE(ipage);
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+ trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err);
out:
f2fs_put_page(ipage, 1);
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 44582a4db513..66969ae852b9 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -367,8 +367,7 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- if (f2fs_init_extent_tree(inode, &ri->i_ext))
- set_page_dirty(node_page);
+ f2fs_init_extent_tree(inode, node_page);
get_inline_info(inode, ri);
@@ -402,6 +401,7 @@ static int do_read_inode(struct inode *inode)
/* try to recover cold bit for non-dir inode */
if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) {
+ f2fs_wait_on_page_writeback(node_page, NODE, true, true);
set_cold_node(node_page, false);
set_page_dirty(node_page);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e94e02c6580a..45f324511a19 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -28,6 +28,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_t ino;
struct inode *inode;
bool nid_free = false;
+ bool encrypt = false;
int xattr_size = 0;
int err;
@@ -69,13 +70,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
F2FS_DEF_PROJID);
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto fail_drop;
+
err = dquot_initialize(inode);
if (err)
goto fail_drop;
set_inode_flag(inode, FI_NEW_INODE);
- if (f2fs_may_encrypt(dir, inode))
+ if (encrypt)
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi)) {
@@ -569,15 +574,17 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
trace_f2fs_unlink_enter(dir, dentry);
- if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto fail;
+ }
err = dquot_initialize(dir);
if (err)
- return err;
+ goto fail;
err = dquot_initialize(inode);
if (err)
- return err;
+ goto fail;
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de) {
@@ -600,7 +607,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
- * negative dentries at f2fs_lookup(), when it is better
+ * negative dentries at f2fs_lookup(), when it is better
* supported by the VFS for the CI case.
*/
if (IS_CASEFOLDED(dir))
@@ -1285,7 +1292,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
- .get_link = f2fs_encrypted_get_link,
+ .get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.listxattr = f2fs_listxattr,
@@ -1311,7 +1318,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
};
const struct inode_operations f2fs_symlink_inode_operations = {
- .get_link = f2fs_get_link,
+ .get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.listxattr = f2fs_listxattr,
@@ -1319,7 +1326,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
const struct inode_operations f2fs_special_inode_operations = {
.getattr = f2fs_getattr,
- .setattr = f2fs_setattr,
+ .setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 03e24df1c84f..cb1b5b61a1da 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -618,10 +618,10 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
switch (dn->max_level) {
case 3:
base += 2 * indirect_blks;
- /* fall through */
+ fallthrough;
case 2:
base += 2 * direct_blks;
- /* fall through */
+ fallthrough;
case 1:
base += direct_index;
break;
@@ -1041,8 +1041,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
- if (level < 0)
+ if (level < 0) {
+ trace_f2fs_truncate_inode_blocks_exit(inode, level);
return level;
+ }
page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -1726,7 +1728,7 @@ continue_unlock:
set_dentry_mark(page,
f2fs_need_dentry_mark(sbi, ino));
}
- /* may be written by other thread */
+ /* may be written by other thread */
if (!PageDirty(page))
set_page_dirty(page);
}
@@ -1814,12 +1816,11 @@ static bool flush_dirty_inode(struct page *page)
return true;
}
-int f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
+void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
{
pgoff_t index = 0;
struct pagevec pvec;
int nr_pages;
- int ret = 0;
pagevec_init(&pvec);
@@ -1858,7 +1859,6 @@ continue_unlock:
pagevec_release(&pvec);
cond_resched();
}
- return ret;
}
int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
@@ -1924,8 +1924,12 @@ continue_unlock:
goto continue_unlock;
}
- /* flush inline_data, if it's async context. */
- if (do_balance && is_inline_node(page)) {
+ /* flush inline_data/inode, if it's async context. */
+ if (!do_balance)
+ goto write_node;
+
+ /* flush inline_data */
+ if (is_inline_node(page)) {
clear_inline_node(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
@@ -1938,7 +1942,7 @@ continue_unlock:
if (flush_dirty_inode(page))
goto lock_node;
}
-
+write_node:
f2fs_wait_on_page_writeback(page, NODE, true, true);
if (!clear_page_dirty_for_io(page))
@@ -2097,7 +2101,7 @@ const struct address_space_operations f2fs_node_aops = {
.invalidatepage = f2fs_invalidate_page,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
+ .migratepage = f2fs_migrate_page,
#endif
};
@@ -2108,7 +2112,7 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
}
static int __insert_free_nid(struct f2fs_sb_info *sbi,
- struct free_nid *i, enum nid_state state)
+ struct free_nid *i)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -2116,10 +2120,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi,
if (err)
return err;
- f2fs_bug_on(sbi, state != i->state);
- nm_i->nid_cnt[state]++;
- if (state == FREE_NID)
- list_add_tail(&i->list, &nm_i->free_nid_list);
+ nm_i->nid_cnt[FREE_NID]++;
+ list_add_tail(&i->list, &nm_i->free_nid_list);
return 0;
}
@@ -2241,7 +2243,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
}
}
ret = true;
- err = __insert_free_nid(sbi, i, FREE_NID);
+ err = __insert_free_nid(sbi, i);
err_out:
if (update) {
update_free_nid_bitmap(sbi, nid, ret, build);
@@ -2371,6 +2373,9 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (unlikely(nid >= nm_i->max_nid))
nid = 0;
+ if (unlikely(nid % NAT_ENTRY_PER_BLOCK))
+ nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK;
+
/* Enough entries */
if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
return 0;
@@ -2572,7 +2577,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
return nr - nr_shrink;
}
-void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
+int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
size_t inline_size;
@@ -2580,7 +2585,8 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
struct f2fs_inode *ri;
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
- f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
ri = F2FS_INODE(page);
if (ri->i_inline & F2FS_INLINE_XATTR) {
@@ -2599,6 +2605,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
update_inode:
f2fs_update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
+ return 0;
}
int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index ae5310f02e7f..4f12ade6410a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -544,7 +544,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
/* step 1: recover xattr */
if (IS_INODE(page)) {
- f2fs_recover_inline_xattr(inode, page);
+ err = f2fs_recover_inline_xattr(inode, page);
+ if (err)
+ goto out;
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
err = f2fs_recover_xattr_data(inode, page);
if (!err)
@@ -553,8 +555,12 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* step 2: recover inline data */
- if (f2fs_recover_inline_data(inode, page))
+ err = f2fs_recover_inline_data(inode, page);
+ if (err) {
+ if (err == 1)
+ err = 0;
goto out;
+ }
/* step 3: recover data indices */
start = f2fs_start_bidx_of_node(ofs_of_node(page), inode);
@@ -742,7 +748,7 @@ next:
f2fs_put_page(page, 1);
}
if (!err)
- f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
+ f2fs_allocate_new_segments(sbi);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 196f31503511..e247a5ef3713 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -174,7 +174,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
if (f2fs_lfs_mode(sbi))
return false;
- if (sbi->gc_mode == GC_URGENT)
+ if (sbi->gc_mode == GC_URGENT_HIGH)
return true;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return true;
@@ -796,6 +796,18 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
}
if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]++;
+
+ if (__is_large_section(sbi)) {
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ block_t valid_blocks =
+ get_valid_blocks(sbi, segno, true);
+
+ f2fs_bug_on(sbi, unlikely(!valid_blocks ||
+ valid_blocks == BLKS_PER_SEC(sbi)));
+
+ if (!IS_CURSEC(sbi, secno))
+ set_bit(secno, dirty_i->dirty_secmap);
+ }
}
}
@@ -803,6 +815,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
enum dirty_type dirty_type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ block_t valid_blocks;
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
dirty_i->nr_dirty[dirty_type]--;
@@ -814,13 +827,26 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]--;
- if (get_valid_blocks(sbi, segno, true) == 0) {
+ valid_blocks = get_valid_blocks(sbi, segno, true);
+ if (valid_blocks == 0) {
clear_bit(GET_SEC_FROM_SEG(sbi, segno),
dirty_i->victim_secmap);
#ifdef CONFIG_F2FS_CHECK_FS
clear_bit(segno, SIT_I(sbi)->invalid_segmap);
#endif
}
+ if (__is_large_section(sbi)) {
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
+ if (!valid_blocks ||
+ valid_blocks == BLKS_PER_SEC(sbi)) {
+ clear_bit(secno, dirty_i->dirty_secmap);
+ return;
+ }
+
+ if (!IS_CURSEC(sbi, secno))
+ set_bit(secno, dirty_i->dirty_secmap);
+ }
}
}
@@ -1733,7 +1759,7 @@ static int issue_discard_thread(void *data)
continue;
}
- if (sbi->gc_mode == GC_URGENT)
+ if (sbi->gc_mode == GC_URGENT_HIGH)
__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
sb_start_intwrite(sbi->sb);
@@ -2140,7 +2166,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
new_vblocks = se->valid_blocks + del;
offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
- f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
+ f2fs_bug_on(sbi, (new_vblocks < 0 ||
(new_vblocks > sbi->blocks_per_seg)));
se->valid_blocks = new_vblocks;
@@ -2605,7 +2631,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
bool reversed = false;
/* f2fs_need_SSR() already forces to do this */
- if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+ if (!v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
curseg->next_segno = segno;
return 1;
}
@@ -2632,7 +2658,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
for (; cnt-- > 0; reversed ? i-- : i++) {
if (i == type)
continue;
- if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+ if (!v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
curseg->next_segno = segno;
return 1;
}
@@ -2674,7 +2700,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
stat_inc_seg_type(sbi, curseg);
}
-void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2707,28 +2733,35 @@ unlock:
up_read(&SM_I(sbi)->curseg_lock);
}
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
+static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type)
{
- struct curseg_info *curseg;
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int old_segno;
- int i;
- down_write(&SIT_I(sbi)->sentry_lock);
+ if (!curseg->next_blkoff &&
+ !get_valid_blocks(sbi, curseg->segno, false) &&
+ !get_ckpt_valid_blocks(sbi, curseg->segno))
+ return;
- for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
- if (type != NO_CHECK_TYPE && i != type)
- continue;
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+ locate_dirty_segment(sbi, old_segno);
+}
- curseg = CURSEG_I(sbi, i);
- if (type == NO_CHECK_TYPE || curseg->next_blkoff ||
- get_valid_blocks(sbi, curseg->segno, false) ||
- get_ckpt_valid_blocks(sbi, curseg->segno)) {
- old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
- locate_dirty_segment(sbi, old_segno);
- }
- }
+void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type)
+{
+ down_write(&SIT_I(sbi)->sentry_lock);
+ __allocate_new_segment(sbi, type);
+ up_write(&SIT_I(sbi)->sentry_lock);
+}
+
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+{
+ int i;
+ down_write(&SIT_I(sbi)->sentry_lock);
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
+ __allocate_new_segment(sbi, i);
up_write(&SIT_I(sbi)->sentry_lock);
}
@@ -3089,7 +3122,7 @@ static int __get_segment_type(struct f2fs_io_info *fio)
void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
- struct f2fs_io_info *fio, bool add_list)
+ struct f2fs_io_info *fio)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -3107,14 +3140,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
type = CURSEG_COLD_DATA;
}
- /*
- * We need to wait for node_write to avoid block allocation during
- * checkpoint. This can only happen to quota writes which can cause
- * the below discard race condition.
- */
- if (IS_DATASEG(type))
- down_write(&sbi->node_write);
-
down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
@@ -3165,7 +3190,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (F2FS_IO_ALIGNED(sbi))
fio->retry = false;
- if (add_list) {
+ if (fio) {
struct f2fs_bio_info *io;
INIT_LIST_HEAD(&fio->list);
@@ -3180,9 +3205,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
up_read(&SM_I(sbi)->curseg_lock);
- if (IS_DATASEG(type))
- up_write(&sbi->node_write);
-
if (put_pin_sem)
up_read(&sbi->pin_sem);
}
@@ -3217,7 +3239,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
down_read(&fio->sbi->io_order_lock);
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
- &fio->new_blkaddr, sum, type, fio, true);
+ &fio->new_blkaddr, sum, type, fio);
if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
invalidate_mapping_pages(META_MAPPING(fio->sbi),
fio->old_blkaddr, fio->old_blkaddr);
@@ -4293,8 +4315,9 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0;
- unsigned short valid_blocks;
+ unsigned int segno = 0, offset = 0, secno;
+ block_t valid_blocks;
+ block_t blks_per_sec = BLKS_PER_SEC(sbi);
while (1) {
/* find dirty segment based on free segmap */
@@ -4313,6 +4336,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
__locate_dirty_segment(sbi, segno, DIRTY);
mutex_unlock(&dirty_i->seglist_lock);
}
+
+ if (!__is_large_section(sbi))
+ return;
+
+ mutex_lock(&dirty_i->seglist_lock);
+ for (segno = 0; segno < MAIN_SECS(sbi); segno += blks_per_sec) {
+ valid_blocks = get_valid_blocks(sbi, segno, true);
+ secno = GET_SEC_FROM_SEG(sbi, segno);
+
+ if (!valid_blocks || valid_blocks == blks_per_sec)
+ continue;
+ if (IS_CURSEC(sbi, secno))
+ continue;
+ set_bit(secno, dirty_i->dirty_secmap);
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
}
static int init_victim_secmap(struct f2fs_sb_info *sbi)
@@ -4349,6 +4388,14 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
return -ENOMEM;
}
+ if (__is_large_section(sbi)) {
+ bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
+ dirty_i->dirty_secmap = f2fs_kvzalloc(sbi,
+ bitmap_size, GFP_KERNEL);
+ if (!dirty_i->dirty_secmap)
+ return -ENOMEM;
+ }
+
init_dirty_segmap(sbi);
return init_victim_secmap(sbi);
}
@@ -4775,6 +4822,12 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_DIRTY_TYPE; i++)
discard_dirty_segmap(sbi, i);
+ if (__is_large_section(sbi)) {
+ mutex_lock(&dirty_i->seglist_lock);
+ kvfree(dirty_i->dirty_secmap);
+ mutex_unlock(&dirty_i->seglist_lock);
+ }
+
destroy_victim_secmap(sbi);
SM_I(sbi)->dirty_info = NULL;
kvfree(dirty_i);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index cba16cca5189..752b177073b2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -166,8 +166,11 @@ enum {
struct victim_sel_policy {
int alloc_mode; /* LFS or SSR */
int gc_mode; /* GC_CB or GC_GREEDY */
- unsigned long *dirty_segmap; /* dirty segment bitmap */
- unsigned int max_search; /* maximum # of segments to search */
+ unsigned long *dirty_bitmap; /* dirty segment/section bitmap */
+ unsigned int max_search; /*
+ * maximum # of segments/sections
+ * to search
+ */
unsigned int offset; /* last scanned bitmap offset */
unsigned int ofs_unit; /* bitmap search unit */
unsigned int min_cost; /* minimum cost */
@@ -184,7 +187,7 @@ struct seg_entry {
unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */
#endif
/*
- * # of valid blocks and the validity bitmap stored in the the last
+ * # of valid blocks and the validity bitmap stored in the last
* checkpoint pack. This information is used by the SSR mode.
*/
unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */
@@ -266,6 +269,7 @@ enum dirty_type {
struct dirty_seglist_info {
const struct victim_selection *v_ops; /* victim selction operation */
unsigned long *dirty_segmap[NR_DIRTY_TYPE];
+ unsigned long *dirty_secmap;
struct mutex seglist_lock; /* lock for segment bitmaps */
int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
unsigned long *victim_secmap; /* background GC victims */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 23c49c313fb6..bef2be3fa3d0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -350,7 +350,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
set_opt(sbi, QUOTA);
return 0;
errout:
- kvfree(qname);
+ kfree(qname);
return ret;
}
@@ -362,7 +362,7 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
return -EINVAL;
}
- kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
+ kfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
return 0;
}
@@ -433,12 +433,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) {
+ if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) {
f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
return -EINVAL;
}
err = fscrypt_set_test_dummy_encryption(
- sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx);
+ sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy);
if (err) {
if (err == -EEXIST)
f2fs_warn(sbi,
@@ -462,9 +462,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
substring_t args[MAX_OPT_ARGS];
+#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned char (*ext)[F2FS_EXTENSION_LEN];
+ int ext_cnt;
+#endif
char *p, *name;
- int arg = 0, ext_cnt;
+ int arg = 0;
kuid_t uid;
kgid_t gid;
int ret;
@@ -496,10 +499,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
} else if (!strcmp(name, "sync")) {
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
} else {
- kvfree(name);
+ kfree(name);
return -EINVAL;
}
- kvfree(name);
+ kfree(name);
break;
case Opt_disable_roll_forward:
set_opt(sbi, DISABLE_ROLL_FORWARD);
@@ -656,17 +659,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!strcmp(name, "adaptive")) {
if (f2fs_sb_has_blkzoned(sbi)) {
f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature");
- kvfree(name);
+ kfree(name);
return -EINVAL;
}
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (!strcmp(name, "lfs")) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
} else {
- kvfree(name);
+ kfree(name);
return -EINVAL;
}
- kvfree(name);
+ kfree(name);
break;
case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
@@ -792,10 +795,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
} else if (!strcmp(name, "fs-based")) {
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
} else {
- kvfree(name);
+ kfree(name);
return -EINVAL;
}
- kvfree(name);
+ kfree(name);
break;
case Opt_alloc:
name = match_strdup(&args[0]);
@@ -807,10 +810,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
} else if (!strcmp(name, "reuse")) {
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
} else {
- kvfree(name);
+ kfree(name);
return -EINVAL;
}
- kvfree(name);
+ kfree(name);
break;
case Opt_fsync:
name = match_strdup(&args[0]);
@@ -824,10 +827,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
F2FS_OPTION(sbi).fsync_mode =
FSYNC_MODE_NOBARRIER;
} else {
- kvfree(name);
+ kfree(name);
return -EINVAL;
}
- kvfree(name);
+ kfree(name);
break;
case Opt_test_dummy_encryption:
ret = f2fs_set_test_dummy_encryption(sb, p, &args[0],
@@ -862,6 +865,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_checkpoint_enable:
clear_opt(sbi, DISABLE_CHECKPOINT);
break;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
case Opt_compress_algorithm:
if (!f2fs_sb_has_compression(sbi)) {
f2fs_err(sbi, "Compression feature if off");
@@ -927,6 +931,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
break;
+#else
+ case Opt_compress_algorithm:
+ case Opt_compress_log_size:
+ case Opt_compress_extension:
+ f2fs_info(sbi, "compression options not supported");
+ break;
+#endif
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1024,6 +1035,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
+ fi->ra_offset = -1;
+
return &fi->vfs_inode;
}
@@ -1182,6 +1195,9 @@ static void f2fs_put_super(struct super_block *sb)
int i;
bool dropped;
+ /* unregister procfs/sysfs entries in advance to avoid race case */
+ f2fs_unregister_sysfs(sbi);
+
f2fs_quota_off_umount(sb);
/* prevent remaining shrinker jobs */
@@ -1247,28 +1263,26 @@ static void f2fs_put_super(struct super_block *sb)
kvfree(sbi->ckpt);
- f2fs_unregister_sysfs(sbi);
-
sb->s_fs_info = NULL;
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- kvfree(sbi->raw_super);
+ kfree(sbi->raw_super);
destroy_device_list(sbi);
f2fs_destroy_xattr_caches(sbi);
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
- kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
- kvfree(sbi);
+ kfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
@@ -1617,7 +1631,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER)
seq_printf(seq, ",fsync_mode=%s", "nobarrier");
+#ifdef CONFIG_F2FS_FS_COMPRESSION
f2fs_show_compress_options(seq, sbi->sb);
+#endif
return 0;
}
@@ -1768,7 +1784,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
GFP_KERNEL);
if (!org_mount_opt.s_qf_names[i]) {
for (j = 0; j < i; j++)
- kvfree(org_mount_opt.s_qf_names[j]);
+ kfree(org_mount_opt.s_qf_names[j]);
return -ENOMEM;
}
} else {
@@ -1893,7 +1909,7 @@ skip:
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- kvfree(org_mount_opt.s_qf_names[i]);
+ kfree(org_mount_opt.s_qf_names[i]);
#endif
/* Update the POSIXACL Flag */
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -1914,7 +1930,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ kfree(F2FS_OPTION(sbi).s_qf_names[i]);
F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i];
}
#endif
@@ -2466,10 +2482,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
ctx, len, fs_data, XATTR_CREATE);
}
-static const union fscrypt_context *
-f2fs_get_dummy_context(struct super_block *sb)
+static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb)
{
- return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx;
+ return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy;
}
static bool f2fs_has_stable_inodes(struct super_block *sb)
@@ -2507,7 +2522,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
- .get_dummy_context = f2fs_get_dummy_context,
+ .get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
.max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
@@ -3172,7 +3187,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
/* No valid superblock */
if (!*raw_super)
- kvfree(super);
+ kfree(super);
else
err = 0;
@@ -3846,16 +3861,16 @@ free_bio_info:
free_options:
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
- kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
kvfree(options);
free_sb_buf:
- kvfree(raw_super);
+ kfree(raw_super);
free_sbi:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- kvfree(sbi);
+ kfree(sbi);
/* give only one another chance */
if (retry_cnt > 0 && skip_recovery) {
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index e877c59b9fdb..88ed9969cc86 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -27,7 +27,7 @@ enum {
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
#ifdef CONFIG_F2FS_STAT_FS
- STAT_INFO, /* struct f2fs_stat_info */
+ STAT_INFO, /* struct f2fs_stat_info */
#endif
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_RATE, /* struct f2fs_fault_info */
@@ -223,6 +223,13 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
}
#endif
+static ssize_t main_blkaddr_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)MAIN_BLKADDR(sbi));
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -350,16 +357,20 @@ out:
return -EINVAL;
if (!strcmp(a->attr.name, "gc_urgent")) {
- if (t >= 1) {
- sbi->gc_mode = GC_URGENT;
+ if (t == 0) {
+ sbi->gc_mode = GC_NORMAL;
+ } else if (t == 1) {
+ sbi->gc_mode = GC_URGENT_HIGH;
if (sbi->gc_thread) {
sbi->gc_thread->gc_wake = 1;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
wake_up_discard_thread(sbi, true);
}
+ } else if (t == 2) {
+ sbi->gc_mode = GC_URGENT_LOW;
} else {
- sbi->gc_mode = GC_NORMAL;
+ return -EINVAL;
}
return count;
}
@@ -522,7 +533,6 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
@@ -565,6 +575,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
+F2FS_GENERAL_RO_ATTR(main_blkaddr);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -706,7 +717,7 @@ static struct kobj_type f2fs_ktype = {
};
static struct kset f2fs_kset = {
- .kobj = {.ktype = &f2fs_ktype},
+ .kobj = {.ktype = &f2fs_ktype},
};
static struct kobj_type f2fs_feat_ktype = {
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 865c9fb774fb..9eb0dba851e8 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -29,6 +29,8 @@
#include "f2fs.h"
#include "xattr.h"
+#define F2FS_VERIFY_VER (1)
+
static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode)
{
return round_up(inode->i_size, 65536);
@@ -152,7 +154,7 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc,
struct inode *inode = file_inode(filp);
u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
struct fsverity_descriptor_location dloc = {
- .version = cpu_to_le32(1),
+ .version = cpu_to_le32(F2FS_VERIFY_VER),
.size = cpu_to_le32(desc_size),
.pos = cpu_to_le64(desc_pos),
};
@@ -199,7 +201,7 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL);
if (res < 0 && res != -ERANGE)
return res;
- if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
+ if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) {
f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format");
return -EINVAL;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 4f6582ef7ee3..1b0736ce0918 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -175,8 +175,8 @@ const struct xattr_handler f2fs_xattr_trusted_handler = {
const struct xattr_handler f2fs_xattr_advise_handler = {
.name = F2FS_SYSTEM_ADVISE_NAME,
.flags = F2FS_XATTR_INDEX_ADVISE,
- .get = f2fs_xattr_advise_get,
- .set = f2fs_xattr_advise_set,
+ .get = f2fs_xattr_advise_get,
+ .set = f2fs_xattr_advise_set,
};
const struct xattr_handler f2fs_xattr_security_handler = {
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index ca31993dcb47..66532a71e8fd 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -41,7 +41,7 @@ config MSDOS_FS
they are compressed; to access compressed MSDOS partitions under
Linux, you can either use the DOS emulator DOSEMU, described in the
DOSEMU-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
+ <https://www.tldp.org/docs.html#howto>, or try dmsdosfs in
<ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
intend to use dosemu with a non-compressed MSDOS partition, say Y
here) and MSDOS floppies. This means that file access becomes
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index bbfe18c07417..f7e3304b7802 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -657,6 +657,9 @@ static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra,
unsigned long ra_pages = sb->s_bdi->ra_pages;
unsigned int reada_blocks;
+ if (fatent->entry >= ent_limit)
+ return;
+
if (ra_pages > sb->s_bdi->io_pages)
ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages);
reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 42134c58c87e..f9ee27cf4d7c 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -25,9 +25,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
- inode_lock(inode);
+ inode_lock_shared(inode);
attr = fat_make_attrs(inode);
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return put_user(attr, user_attr);
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2e4c0fa2074b..19ac5baad50f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -362,7 +362,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_OFD_SETLK:
case F_OFD_SETLKW:
#endif
- /* Fallthrough */
+ fallthrough;
case F_SETLK:
case F_SETLKW:
if (copy_from_user(&flock, argp, sizeof(flock)))
@@ -771,7 +771,7 @@ static void send_sigio_to_task(struct task_struct *p,
if (!do_send_sig_info(signum, &si, p, type))
break;
}
- /* fall-through - fall back on the old plain SIGIO signal */
+ fallthrough; /* fall back on the old plain SIGIO signal */
case 0:
do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
}
diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..4559b5fec3bd 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>
+#include <linux/io_uring.h>
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk)
struct files_struct * files = tsk->files;
if (files) {
+ io_uring_files_cancel(files);
task_lock(tsk);
tsk->files = NULL;
task_unlock(tsk);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a605c3dddabc..e6005c78bfa9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,7 +42,6 @@
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
- unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
@@ -144,7 +143,9 @@ static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -1122,7 +1123,9 @@ void inode_io_list_del(struct inode *inode)
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
+ spin_lock(&inode->i_lock);
inode_io_list_del_locked(inode, wb);
+ spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);
@@ -1172,8 +1175,10 @@ void sb_clear_inode_writeback(struct inode *inode)
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
-static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
+ assert_spin_locked(&inode->i_lock);
+
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -1182,6 +1187,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
+ inode->i_state &= ~I_SYNC_QUEUED;
+}
+
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+{
+ spin_lock(&inode->i_lock);
+ redirty_tail_locked(inode, wb);
+ spin_unlock(&inode->i_lock);
}
/*
@@ -1220,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
#define EXPIRE_DIRTY_ATIME 0x0001
/*
- * Move expired (dirtied before work->older_than_this) dirty inodes from
+ * Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- int flags,
- struct wb_writeback_work *work)
+ unsigned long dirtied_before)
{
- unsigned long *older_than_this = NULL;
- unsigned long expire_time;
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
@@ -1237,21 +1247,15 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0;
int moved = 0;
- if ((flags & EXPIRE_DIRTY_ATIME) == 0)
- older_than_this = work->older_than_this;
- else if (!work->for_sync) {
- expire_time = jiffies - (dirtytime_expire_interval * HZ);
- older_than_this = &expire_time;
- }
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (older_than_this &&
- inode_dirtied_after(inode, *older_than_this))
+ if (inode_dirtied_after(inode, dirtied_before))
break;
list_move(&inode->i_io_list, &tmp);
moved++;
- if (flags & EXPIRE_DIRTY_ATIME)
- set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_SYNC_QUEUED;
+ spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -1289,18 +1293,22 @@ out:
* |
* +--> dequeue for IO
*/
-static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
+static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
+ unsigned long dirtied_before)
{
int moved;
+ unsigned long time_expire_jif = dirtied_before;
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
+ if (!work->for_sync)
+ time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
- EXPIRE_DIRTY_ATIME, work);
+ time_expire_jif);
if (moved)
wb_io_lists_populated(wb);
- trace_writeback_queue_io(wb, work, moved);
+ trace_writeback_queue_io(wb, work, dirtied_before, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -1394,7 +1402,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
- redirty_tail(inode, wb);
+ redirty_tail_locked(inode, wb);
return;
}
@@ -1414,7 +1422,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
- redirty_tail(inode, wb);
+ redirty_tail_locked(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
@@ -1422,10 +1430,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
- redirty_tail(inode, wb);
+ redirty_tail_locked(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
inode_io_list_del_locked(inode, wb);
@@ -1472,18 +1481,14 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
- if (inode->i_state & I_DIRTY_TIME) {
- if ((dirty & I_DIRTY_INODE) ||
- wbc->sync_mode == WB_SYNC_ALL ||
- unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
- unlikely(time_after(jiffies,
- (inode->dirtied_time_when +
- dirtytime_expire_interval * HZ)))) {
- dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
- trace_writeback_lazytime(inode);
- }
- } else
- inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
+ if ((inode->i_state & I_DIRTY_TIME) &&
+ ((dirty & I_DIRTY_INODE) ||
+ wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
+ time_after(jiffies, inode->dirtied_time_when +
+ dirtytime_expire_interval * HZ))) {
+ dirty |= I_DIRTY_TIME;
+ trace_writeback_lazytime(inode);
+ }
inode->i_state &= ~dirty;
/*
@@ -1669,8 +1674,8 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
- redirty_tail(inode, wb);
continue;
}
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
@@ -1811,7 +1816,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
- queue_io(wb, &work);
+ queue_io(wb, &work, jiffies);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
@@ -1831,7 +1836,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
- * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * dirtied_before takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
static long wb_writeback(struct bdi_writeback *wb,
@@ -1839,14 +1844,11 @@ static long wb_writeback(struct bdi_writeback *wb,
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
- unsigned long oldest_jif;
+ unsigned long dirtied_before = jiffies;
struct inode *inode;
long progress;
struct blk_plug plug;
- oldest_jif = jiffies;
- work->older_than_this = &oldest_jif;
-
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
@@ -1880,14 +1882,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* safe.
*/
if (work->for_kupdate) {
- oldest_jif = jiffies -
+ dirtied_before = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
- oldest_jif = jiffies;
+ dirtied_before = jiffies;
trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
- queue_io(wb, work);
+ queue_io(wb, work, dirtied_before);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
@@ -2182,7 +2184,7 @@ static int __init start_dirtytime_writeback(void)
__initcall(start_dirtytime_writeback);
int dirtytime_interval_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2289,11 +2291,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags;
/*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
+ * If the inode is queued for writeback by flush worker, just
+ * update its dirty state. Once the flush worker is done with
+ * the inode it will place it on the appropriate superblock
+ * list, based upon its state.
*/
- if (inode->i_state & I_SYNC)
+ if (inode->i_state & I_SYNC_QUEUED)
goto out_unlock_inode;
/*
@@ -2318,7 +2321,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wb = locked_inode_to_wb_and_lock_list(inode);
- WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+ WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
!test_bit(WB_registered, &wb->state),
"bdi-%s not registered\n", bdi_dev_name(wb->bdi));
@@ -2343,7 +2346,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* to make sure background write-back happens
* later.
*/
- if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+ if (wakeup_bdi &&
+ (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
wb_wakeup_delayed(wb);
return;
}
@@ -2578,7 +2582,7 @@ int write_inode_now(struct inode *inode, int sync)
.range_end = LLONG_MAX,
};
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 7d5c5dd2b1d5..2834d1afa6e8 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -521,7 +521,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (param->type) {
case fs_value_is_string:
len = 1 + param->size;
- /* Fall through */
+ fallthrough;
case fs_value_is_flag:
len += strlen(param->key);
break;
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ab53e42a874a..68b0148f4bb8 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -189,7 +189,7 @@ out:
}
EXPORT_SYMBOL(fs_lookup_param);
-int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
+static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
{
return inval_plog(log, "Bad value for '%s'", param->key);
}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ca639ed967b7..04b3f5b9c629 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -117,7 +117,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
fs->users = 1;
fs->in_exec = 0;
spin_lock_init(&fs->lock);
- seqcount_init(&fs->seq);
+ seqcount_spinlock_init(&fs->seq, &fs->lock);
fs->umask = old->umask;
spin_lock(&old->lock);
@@ -163,6 +163,6 @@ EXPORT_SYMBOL(current_umask);
struct fs_struct init_fs = {
.users = 1,
.lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
- .seq = SEQCNT_ZERO(init_fs.seq),
+ .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
.umask = 0022,
};
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 2fa3f241b762..27a890aa493a 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -412,7 +412,7 @@ SYSCALL_DEFINE5(fsconfig,
break;
case FSCONFIG_SET_PATH_EMPTY:
lookup_flags = LOOKUP_EMPTY;
- /* fallthru */
+ fallthrough;
case FSCONFIG_SET_PATH:
param.type = fs_value_is_filename;
param.name = getname_flags(_value, lookup_flags, NULL);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6611ef3269a8..43c165e796da 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3091,11 +3091,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- bool async_dio = ff->fc->async_dio;
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
- size_t count = iov_iter_count(iter);
+ size_t count = iov_iter_count(iter), shortened = 0;
loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
@@ -3103,17 +3102,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode = file->f_mapping->host;
i_size = i_size_read(inode);
- if ((iov_iter_rw(iter) == READ) && (offset > i_size))
+ if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
return 0;
- /* optimization for short read */
- if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
- if (offset >= i_size)
- return 0;
- iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
- count = iov_iter_count(iter);
- }
-
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
if (!io)
return -ENOMEM;
@@ -3129,15 +3120,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = async_dio;
+ io->async = ff->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
+ /* optimization for short read */
+ if (io->async && !io->write && offset + count > i_size) {
+ iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
+ shortened = count - iov_iter_count(iter);
+ count -= shortened;
+ }
+
/*
* We cannot asynchronously extend the size of a file.
* In such case the aio will behave exactly like sync io.
*/
- if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+ if ((offset + count > i_size) && io->write)
io->blocking = true;
if (io->async && io->blocking) {
@@ -3155,6 +3153,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
+ iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
if (io->async) {
bool blocking = io->blocking;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bba747520e9b..581329203d68 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1049,9 +1049,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
- sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
+ sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
+ sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4c4ef5d69298..104f35de5270 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -606,8 +606,8 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
unsigned int i;
int ret = 0;
- virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
- &fs->num_request_queues);
+ virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues,
+ &fs->num_request_queues);
if (fs->num_request_queues == 0)
return -EINVAL;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8dfe09f52cbc..0f69fbd4af66 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -746,7 +746,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
}
if (n == 0)
break;
- /* fall through - To branching from existing tree */
+ fallthrough; /* To branching from existing tree */
case ALLOC_GROW_DEPTH:
if (i > 1 && i < mp->mp_fheight)
gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
@@ -757,7 +757,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
state = ALLOC_DATA;
if (n == 0)
break;
- /* fall through - To tree complete, adding data blocks */
+ fallthrough; /* To tree complete, adding data blocks */
case ALLOC_DATA:
BUG_ON(n > dblks);
BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
@@ -1351,9 +1351,15 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
return ret;
}
+/*
+ * NOTE: Never call gfs2_block_zero_range with an open transaction because it
+ * uses iomap write to perform its actions, which begin their own transactions
+ * (iomap_begin, page_prepare, etc.)
+ */
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
+ BUG_ON(current->journal_info);
return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
}
@@ -1414,6 +1420,16 @@ static int trunc_start(struct inode *inode, u64 newsize)
u64 oldsize = inode->i_size;
int error;
+ if (!gfs2_is_stuffed(ip)) {
+ unsigned int blocksize = i_blocksize(inode);
+ unsigned int offs = newsize & (blocksize - 1);
+ if (offs) {
+ error = gfs2_block_zero_range(inode, newsize,
+ blocksize - offs);
+ if (error)
+ return error;
+ }
+ }
if (journaled)
error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
else
@@ -1427,19 +1443,10 @@ static int trunc_start(struct inode *inode, u64 newsize)
gfs2_trans_add_meta(ip->i_gl, dibh);
- if (gfs2_is_stuffed(ip)) {
+ if (gfs2_is_stuffed(ip))
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
- } else {
- unsigned int blocksize = i_blocksize(inode);
- unsigned int offs = newsize & (blocksize - 1);
- if (offs) {
- error = gfs2_block_zero_range(inode, newsize,
- blocksize - offs);
- if (error)
- goto out;
- }
+ else
ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
- }
i_size_write(inode, newsize);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
@@ -2448,25 +2455,7 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t start, end;
int error;
- start = round_down(offset, blocksize);
- end = round_up(offset + length, blocksize) - 1;
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (error)
- return error;
-
- if (gfs2_is_jdata(ip))
- error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
- GFS2_JTRUNC_REVOKES);
- else
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
-
- if (gfs2_is_stuffed(ip)) {
- error = stuffed_zero_range(inode, offset, length);
- if (error)
- goto out;
- } else {
+ if (!gfs2_is_stuffed(ip)) {
unsigned int start_off, end_len;
start_off = offset & (blocksize - 1);
@@ -2489,6 +2478,26 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
}
}
+ start = round_down(offset, blocksize);
+ end = round_up(offset + length, blocksize) - 1;
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+
+ if (gfs2_is_jdata(ip))
+ error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
+ GFS2_JTRUNC_REVOKES);
+ else
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ return error;
+
+ if (gfs2_is_stuffed(ip)) {
+ error = stuffed_zero_range(inode, offset, length);
+ if (error)
+ goto out;
+ }
+
if (gfs2_is_jdata(ip)) {
BUG_ON(!current->journal_info);
gfs2_journaled_truncate_range(inode, offset, length);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b085a3bea4f0..b39b339feddc 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -781,39 +781,39 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
-static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
+ struct gfs2_holder *gh)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
size_t count = iov_iter_count(to);
- struct gfs2_holder gh;
ssize_t ret;
if (!count)
return 0; /* skip atime */
- gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
- ret = gfs2_glock_nq(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+ ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
is_sync_kiocb(iocb));
- gfs2_glock_dq(&gh);
+ gfs2_glock_dq(gh);
out_uninit:
- gfs2_holder_uninit(&gh);
+ gfs2_holder_uninit(gh);
return ret;
}
-static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+ struct gfs2_holder *gh)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
size_t len = iov_iter_count(from);
loff_t offset = iocb->ki_pos;
- struct gfs2_holder gh;
ssize_t ret;
/*
@@ -824,8 +824,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* unfortunately, have the option of only flushing a range like the
* VFS does.
*/
- gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
- ret = gfs2_glock_nq(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+ ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
@@ -838,9 +838,9 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = 0;
out:
- gfs2_glock_dq(&gh);
+ gfs2_glock_dq(gh);
out_uninit:
- gfs2_holder_uninit(&gh);
+ gfs2_holder_uninit(gh);
return ret;
}
@@ -852,7 +852,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t ret;
if (iocb->ki_flags & IOCB_DIRECT) {
- ret = gfs2_file_direct_read(iocb, to);
+ ret = gfs2_file_direct_read(iocb, to, &gh);
if (likely(ret != -ENOTBLK))
return ret;
iocb->ki_flags &= ~IOCB_DIRECT;
@@ -901,13 +901,12 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
ssize_t ret;
gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
if (iocb->ki_flags & IOCB_APPEND) {
- struct gfs2_holder gh;
-
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
if (ret)
return ret;
@@ -931,7 +930,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct address_space *mapping = file->f_mapping;
ssize_t buffered, ret2;
- ret = gfs2_file_direct_write(iocb, from);
+ ret = gfs2_file_direct_write(iocb, from, &gh);
if (ret < 0 || !iov_iter_count(from))
goto out_unlock;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8545024a1401..f13b136654ca 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -790,9 +790,11 @@ static void gfs2_glock_poke(struct gfs2_glock *gl)
struct gfs2_holder gh;
int error;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, flags, &gh);
+ gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh);
+ error = gfs2_glock_nq(&gh);
if (!error)
gfs2_glock_dq(&gh);
+ gfs2_holder_uninit(&gh);
}
static bool gfs2_try_evict(struct gfs2_glock *gl)
@@ -2106,6 +2108,12 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'o';
if (test_bit(GLF_BLOCKING, gflags))
*p++ = 'b';
+ if (test_bit(GLF_INODE_CREATING, gflags))
+ *p++ = 'c';
+ if (test_bit(GLF_PENDING_DELETE, gflags))
+ *p++ = 'P';
+ if (test_bit(GLF_FREEING, gflags))
+ *p++ = 'x';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index a76e55bc28eb..3763c9ff1406 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -902,6 +902,36 @@ static void empty_ail1_list(struct gfs2_sbd *sdp)
}
/**
+ * drain_bd - drain the buf and databuf queue for a failed transaction
+ * @tr: the transaction to drain
+ *
+ * When this is called, we're taking an error exit for a log write that failed
+ * but since we bypassed the after_commit functions, we need to remove the
+ * items from the buf and databuf queue.
+ */
+static void trans_drain(struct gfs2_trans *tr)
+{
+ struct gfs2_bufdata *bd;
+ struct list_head *head;
+
+ if (!tr)
+ return;
+
+ head = &tr->tr_buf;
+ while (!list_empty(head)) {
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+ }
+ head = &tr->tr_databuf;
+ while (!list_empty(head)) {
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+ }
+}
+
+/**
* gfs2_log_flush - flush incore transaction(s)
* @sdp: the filesystem
* @gl: The glock structure to flush. If NULL, flush the whole incore log
@@ -1005,6 +1035,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
out:
if (gfs2_withdrawn(sdp)) {
+ trans_drain(tr);
/**
* If the tr_list is empty, we're withdrawing during a log
* flush that targets a transaction, but the transaction was
@@ -1092,7 +1123,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
* or the total number of used blocks (pinned blocks plus AIL blocks)
* is greater than thresh2.
*
- * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
+ * At mount time thresh1 is 2/5ths of journal size, thresh2 is 4/5ths of
* journal size.
*
* Returns: errno
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 4b67d47a7e00..6e173ae378c4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1599,7 +1599,7 @@ static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state)
case GFS2_QUOTA_ON:
state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
- /*FALLTHRU*/
+ fallthrough;
case GFS2_QUOTA_ACCOUNT:
state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED |
QCI_SYSFILE;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 47d0ae158b69..9f4d9e7be839 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -566,6 +566,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret) {
fs_err(sdp, "dirty_inode: glock %d\n", ret);
+ gfs2_dump_glock(NULL, ip->i_gl, true);
return;
}
need_unlock = 1;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index a3dfa3aa87ad..6d4bf7ea7b3b 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -25,13 +25,28 @@
#include "util.h"
#include "trace_gfs2.h"
+static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr)
+{
+ fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip);
+ fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n",
+ tr->tr_blocks, tr->tr_revokes, tr->tr_reserved,
+ test_bit(TR_TOUCHED, &tr->tr_flags));
+ fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
+ tr->tr_num_buf_new, tr->tr_num_buf_rm,
+ tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
+ tr->tr_num_revoke, tr->tr_num_revoke_rm);
+}
+
int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
unsigned int revokes)
{
struct gfs2_trans *tr;
int error;
- BUG_ON(current->journal_info);
+ if (current->journal_info) {
+ gfs2_print_trans(sdp, current->journal_info);
+ BUG();
+ }
BUG_ON(blocks == 0 && revokes == 0);
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
@@ -52,6 +67,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
tr->tr_reserved += gfs2_struct2blk(sdp, revokes);
INIT_LIST_HEAD(&tr->tr_databuf);
INIT_LIST_HEAD(&tr->tr_buf);
+ INIT_LIST_HEAD(&tr->tr_list);
INIT_LIST_HEAD(&tr->tr_ail1_list);
INIT_LIST_HEAD(&tr->tr_ail2_list);
@@ -72,18 +88,6 @@ fail:
return error;
}
-static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr)
-{
- fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip);
- fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n",
- tr->tr_blocks, tr->tr_revokes, tr->tr_reserved,
- test_bit(TR_TOUCHED, &tr->tr_flags));
- fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
- tr->tr_num_buf_new, tr->tr_num_buf_rm,
- tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
- tr->tr_num_revoke, tr->tr_num_revoke_rm);
-}
-
void gfs2_trans_end(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 61eec628805d..0350dc7821bf 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -195,7 +195,7 @@ reread:
switch (sbi->s_vhdr->signature) {
case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
- /*FALLTHRU*/
+ fallthrough;
case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
break;
case cpu_to_be16(HFSP_WRAP_MAGIC):
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 523954d00dff..b5c109703daa 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1364,6 +1364,12 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
+
+ /*
+ * Due to the special and limited functionality of hugetlbfs, it does
+ * not work well as a stacking filesystem.
+ */
+ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
diff --git a/fs/internal.h b/fs/internal.h
index 10517ece4516..a7cd0f64faa4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -82,9 +82,6 @@ int may_linkat(struct path *link);
/*
* namespace.c
*/
-extern void *copy_mount_options(const void __user *);
-extern char *copy_mount_string(const void __user *);
-
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, struct path *);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index e92c4724480c..0a182f1333e8 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
+#include <linux/blk-cgroup.h>
#include "io-wq.h"
@@ -26,9 +27,8 @@ enum {
IO_WORKER_F_UP = 1, /* up and active */
IO_WORKER_F_RUNNING = 2, /* account as running */
IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_EXITING = 8, /* worker exiting */
- IO_WORKER_F_FIXED = 16, /* static idle worker */
- IO_WORKER_F_BOUND = 32, /* is doing bounded work */
+ IO_WORKER_F_FIXED = 8, /* static idle worker */
+ IO_WORKER_F_BOUND = 16, /* is doing bounded work */
};
enum {
@@ -57,9 +57,13 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
+ struct nsproxy *restore_nsproxy;
struct fs_struct *restore_fs;
};
@@ -87,7 +91,7 @@ enum {
*/
struct io_wqe {
struct {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long hash_map;
unsigned flags;
@@ -148,11 +152,12 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
if (current->files != worker->restore_files) {
__acquire(&wqe->lock);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
task_lock(current);
current->files = worker->restore_files;
+ current->nsproxy = worker->restore_nsproxy;
task_unlock(current);
}
@@ -166,7 +171,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
if (worker->mm) {
if (!dropped_lock) {
__acquire(&wqe->lock);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
}
__set_current_state(TASK_RUNNING);
@@ -175,6 +180,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->mm = NULL;
}
+#ifdef CONFIG_BLK_CGROUP
+ if (worker->blkcg_css) {
+ kthread_associate_blkcg(NULL);
+ worker->blkcg_css = NULL;
+ }
+#endif
+
return dropped_lock;
}
@@ -200,7 +212,6 @@ static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
- unsigned nr_workers;
/*
* If we're not at zero, someone else is holding a brief reference
@@ -220,23 +231,19 @@ static void io_worker_exit(struct io_worker *worker)
worker->flags = 0;
preempt_enable();
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
if (__io_worker_unuse(wqe, worker)) {
__release(&wqe->lock);
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
}
acct->nr_workers--;
- nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
- wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
- spin_unlock_irq(&wqe->lock);
-
- /* all workers gone, wq exit can proceed */
- if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
- complete(&wqe->wq->done);
+ raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
+ if (refcount_dec_and_test(&wqe->wq->refs))
+ complete(&wqe->wq->done);
}
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
@@ -318,6 +325,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
+ worker->restore_nsproxy = current->nsproxy;
worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
}
@@ -436,6 +444,17 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
work->flags |= IO_WQ_WORK_CANCEL;
}
+static inline void io_wq_switch_blkcg(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (work->blkcg_css != worker->blkcg_css) {
+ kthread_associate_blkcg(work->blkcg_css);
+ worker->blkcg_css = work->blkcg_css;
+ }
+#endif
+}
+
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
@@ -454,6 +473,7 @@ static void io_impersonate_work(struct io_worker *worker,
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
+ current->nsproxy = work->nsproxy;
task_unlock(current);
}
if (work->fs && current->fs != work->fs)
@@ -463,6 +483,7 @@ static void io_impersonate_work(struct io_worker *worker,
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
+ io_wq_switch_blkcg(worker, work);
}
static void io_assign_current_work(struct io_worker *worker,
@@ -504,7 +525,7 @@ get_next:
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (!work)
break;
io_assign_current_work(worker, work);
@@ -538,17 +559,17 @@ get_next:
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
} while (work);
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
} while (1);
}
@@ -563,7 +584,7 @@ static int io_wqe_worker(void *data)
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
@@ -574,7 +595,7 @@ loop:
__release(&wqe->lock);
goto loop;
}
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (signal_pending(current))
flush_signals(current);
if (schedule_timeout(WORKER_IDLE_TIMEOUT))
@@ -586,11 +607,11 @@ loop:
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (!wq_list_empty(&wqe->work_list))
io_worker_handle_work(worker);
else
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
io_worker_exit(worker);
@@ -630,14 +651,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
io_wqe_dec_running(wqe, worker);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
- struct io_wqe_acct *acct =&wqe->acct[index];
+ struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
@@ -656,7 +677,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
return false;
}
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
@@ -665,11 +686,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
acct->nr_workers++;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (index == IO_WQ_ACCT_UNBOUND)
atomic_inc(&wq->user->processes);
+ refcount_inc(&wq->refs);
wake_up_process(worker->task);
return true;
}
@@ -685,28 +707,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
return acct->nr_workers < acct->max_workers;
}
+static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
+{
+ send_sig(SIGINT, worker->task, 1);
+ return false;
+}
+
+/*
+ * Iterate the passed in list and call the specific function for each
+ * worker that isn't exiting
+ */
+static bool io_wq_for_each_worker(struct io_wqe *wqe,
+ bool (*func)(struct io_worker *, void *),
+ void *data)
+{
+ struct io_worker *worker;
+ bool ret = false;
+
+ list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
+ if (io_worker_get(worker)) {
+ /* no task if node is/was offline */
+ if (worker->task)
+ ret = func(worker, data);
+ io_worker_release(worker);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static bool io_wq_worker_wake(struct io_worker *worker, void *data)
+{
+ wake_up_process(worker->task);
+ return false;
+}
+
/*
* Manager thread. Tasked with creating new workers, if we need them.
*/
static int io_wq_manager(void *data)
{
struct io_wq *wq = data;
- int workers_to_create = num_possible_nodes();
int node;
/* create fixed workers */
- refcount_set(&wq->refs, workers_to_create);
+ refcount_set(&wq->refs, 1);
for_each_node(node) {
if (!node_online(node))
continue;
- if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
- goto err;
- workers_to_create--;
+ if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+ continue;
+ set_bit(IO_WQ_BIT_ERROR, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ goto out;
}
- while (workers_to_create--)
- refcount_dec(&wq->refs);
-
complete(&wq->done);
while (!kthread_should_stop()) {
@@ -720,12 +777,12 @@ static int io_wq_manager(void *data)
if (!node_online(node))
continue;
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
fork_worker[IO_WQ_ACCT_BOUND] = true;
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (fork_worker[IO_WQ_ACCT_BOUND])
create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
if (fork_worker[IO_WQ_ACCT_UNBOUND])
@@ -738,12 +795,18 @@ static int io_wq_manager(void *data)
if (current->task_works)
task_work_run();
- return 0;
-err:
- set_bit(IO_WQ_BIT_ERROR, &wq->state);
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- if (refcount_sub_and_test(workers_to_create, &wq->refs))
+out:
+ if (refcount_dec_and_test(&wq->refs)) {
complete(&wq->done);
+ return 0;
+ }
+ /* if ERROR is set and we get here, we have workers to wake */
+ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
+ rcu_read_lock();
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
+ rcu_read_unlock();
+ }
return 0;
}
@@ -821,10 +884,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
}
work_flags = work->flags;
- spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock_irqsave(&wqe->lock, flags);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))
@@ -850,37 +913,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
}
-static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
-{
- send_sig(SIGINT, worker->task, 1);
- return false;
-}
-
-/*
- * Iterate the passed in list and call the specific function for each
- * worker that isn't exiting
- */
-static bool io_wq_for_each_worker(struct io_wqe *wqe,
- bool (*func)(struct io_worker *, void *),
- void *data)
-{
- struct io_worker *worker;
- bool ret = false;
-
- list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
- if (io_worker_get(worker)) {
- /* no task if node is/was offline */
- if (worker->task)
- ret = func(worker, data);
- io_worker_release(worker);
- if (ret)
- break;
- }
- }
-
- return ret;
-}
-
void io_wq_cancel_all(struct io_wq *wq)
{
int node;
@@ -925,6 +957,24 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return match->nr_running && !match->cancel_all;
}
+static inline void io_wqe_remove_pending(struct io_wqe *wqe,
+ struct io_wq_work *work,
+ struct io_wq_work_node *prev)
+{
+ unsigned int hash = io_get_work_hash(work);
+ struct io_wq_work *prev_work = NULL;
+
+ if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
+ if (prev)
+ prev_work = container_of(prev, struct io_wq_work, list);
+ if (prev_work && io_get_work_hash(prev_work) == hash)
+ wqe->hash_tail[hash] = prev_work;
+ else
+ wqe->hash_tail[hash] = NULL;
+ }
+ wq_list_del(&wqe->work_list, &work->list, prev);
+}
+
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
struct io_cb_cancel_data *match)
{
@@ -933,14 +983,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
unsigned long flags;
retry:
- spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
-
- wq_list_del(&wqe->work_list, node, prev);
- spin_unlock_irqrestore(&wqe->lock, flags);
+ io_wqe_remove_pending(wqe, work, prev);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
match->nr_pending++;
if (!match->cancel_all)
@@ -949,7 +998,7 @@ retry:
/* not safe to continue after unlock */
goto retry;
}
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1057,7 +1106,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
}
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
wqe->wq = wq;
- spin_lock_init(&wqe->lock);
+ raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_LIST_HEAD(&wqe->all_list);
@@ -1096,12 +1145,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
return refcount_inc_not_zero(&wq->use_refs);
}
-static bool io_wq_worker_wake(struct io_worker *worker, void *data)
-{
- wake_up_process(worker->task);
- return false;
-}
-
static void __io_wq_destroy(struct io_wq *wq)
{
int node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index ddaf9614cf9b..84bcf6a85523 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -87,7 +87,11 @@ struct io_wq_work {
struct io_wq_work_node list;
struct files_struct *files;
struct mm_struct *mm;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
const struct cred *creds;
+ struct nsproxy *nsproxy;
struct fs_struct *fs;
unsigned long fsize;
unsigned flags;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2a3af95be4ca..fc6de6b4784e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,8 @@
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
+#include <linux/io_uring.h>
+#include <linux/blk-cgroup.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -98,6 +100,8 @@
#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
+#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
+ IORING_REGISTER_LAST + IORING_OP_LAST)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
@@ -187,6 +191,7 @@ struct io_mapped_ubuf {
size_t len;
struct bio_vec *bvec;
unsigned int nr_bvecs;
+ unsigned long acct_pages;
};
struct fixed_file_table {
@@ -205,7 +210,7 @@ struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
- struct percpu_ref *cur_refs;
+ struct fixed_file_ref_node *node;
struct percpu_ref refs;
struct completion done;
struct list_head ref_list;
@@ -219,6 +224,27 @@ struct io_buffer {
__u16 bid;
};
+struct io_restriction {
+ DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+ DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+ u8 sqe_flags_allowed;
+ u8 sqe_flags_required;
+ bool registered;
+};
+
+struct io_sq_data {
+ refcount_t refs;
+ struct mutex lock;
+
+ /* ctx's that are using this sqd */
+ struct list_head ctx_list;
+ struct list_head ctx_new_list;
+ struct mutex ctx_lock;
+
+ struct task_struct *thread;
+ struct wait_queue_head wait;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -231,6 +257,7 @@ struct io_ring_ctx {
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
+ unsigned int restricted: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -264,9 +291,25 @@ struct io_ring_ctx {
/* IO offload */
struct io_wq *io_wq;
- struct task_struct *sqo_thread; /* if using sq thread polling */
- struct mm_struct *sqo_mm;
- wait_queue_head_t sqo_wait;
+
+ /*
+ * For SQPOLL usage - we hold a reference to the parent task, so we
+ * have access to the ->files
+ */
+ struct task_struct *sqo_task;
+
+ /* Only used for accounting purposes */
+ struct mm_struct *mm_account;
+
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *sqo_blkcg_css;
+#endif
+
+ struct io_sq_data *sq_data; /* if using sq thread polling */
+
+ struct wait_queue_head sqo_sq_wait;
+ struct wait_queue_entry sqo_wait_entry;
+ struct list_head sqd_list;
/*
* If used, fixed file set. Writers must ensure that ->refs is dead,
@@ -275,8 +318,6 @@ struct io_ring_ctx {
*/
struct fixed_file_data *file_data;
unsigned nr_user_files;
- int ring_fd;
- struct file *ring_file;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
@@ -338,6 +379,7 @@ struct io_ring_ctx {
struct llist_head file_put_llist;
struct work_struct exit_work;
+ struct io_restriction restrictions;
};
/*
@@ -392,13 +434,16 @@ struct io_cancel {
struct io_timeout {
struct file *file;
- u64 addr;
- int flags;
u32 off;
u32 target_seq;
struct list_head list;
};
+struct io_timeout_rem {
+ struct file *file;
+ u64 addr;
+};
+
struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
@@ -508,21 +553,12 @@ struct io_async_msghdr {
struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
- struct iovec *iov;
- ssize_t nr_segs;
- ssize_t size;
+ const struct iovec *free_iovec;
+ struct iov_iter iter;
+ size_t bytes_done;
struct wait_page_queue wpq;
};
-struct io_async_ctx {
- union {
- struct io_async_rw rw;
- struct io_async_msghdr msg;
- struct io_async_connect connect;
- struct io_timeout_data timeout;
- };
-};
-
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
@@ -540,12 +576,10 @@ enum {
REQ_F_ISREG_BIT,
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
- REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT,
- REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -583,8 +617,6 @@ enum {
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
- /* in overflow list */
- REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
@@ -593,8 +625,6 @@ enum {
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
- /* req->task is refcounted */
- REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
@@ -617,6 +647,7 @@ struct io_kiocb {
struct io_sync sync;
struct io_cancel cancel;
struct io_timeout timeout;
+ struct io_timeout_rem timeout_rem;
struct io_connect connect;
struct io_sr_msg sr_msg;
struct io_open open;
@@ -632,7 +663,8 @@ struct io_kiocb {
struct io_completion compl;
};
- struct io_async_ctx *io;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
@@ -700,8 +732,6 @@ struct io_submit_state {
};
struct io_op_def {
- /* needs req->io allocated for deferral/async */
- unsigned async_ctx : 1;
/* needs current->mm setup, does mm access */
unsigned needs_mm : 1;
/* needs req->file assigned */
@@ -723,35 +753,49 @@ struct io_op_def {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
+ /* needs rlimit(RLIMIT_FSIZE) assigned */
unsigned needs_fsize : 1;
+ /* must always have async data allocated */
+ unsigned needs_async_data : 1;
+ /* needs blkcg context, issues async io potentially */
+ unsigned needs_blkcg : 1;
+ /* size of async data needed, if any */
+ unsigned short async_size;
};
-static const struct io_op_def io_op_defs[] = {
+static const struct io_op_def io_op_defs[] __read_mostly = {
[IORING_OP_NOP] = {},
[IORING_OP_READV] = {
- .async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .needs_async_data = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
- .async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
+ .needs_async_data = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
@@ -759,6 +803,8 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -767,27 +813,33 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_SENDMSG] = {
- .async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
.pollout = 1,
+ .needs_async_data = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_RECVMSG] = {
- .async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
.pollin = 1,
.buffer_select = 1,
+ .needs_async_data = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
- .async_ctx = 1,
.needs_mm = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_TIMEOUT_REMOVE] = {},
[IORING_OP_ACCEPT] = {
@@ -799,28 +851,33 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
- .async_ctx = 1,
.needs_mm = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_CONNECT] = {
- .async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_async_data = 1,
+ .async_size = sizeof(struct io_async_connect),
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.needs_fsize = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_OPENAT] = {
.file_table = 1,
.needs_fs = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.needs_file_no_error = 1,
.file_table = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_FILES_UPDATE] = {
.needs_mm = 1,
@@ -830,6 +887,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_fs = 1,
.file_table = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
@@ -837,6 +895,8 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
@@ -844,18 +904,23 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
+ .needs_blkcg = 1,
+ .async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_MADVISE] = {
.needs_mm = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_SEND] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
@@ -863,10 +928,12 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_OPENAT2] = {
.file_table = 1,
.needs_fs = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -876,6 +943,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .needs_blkcg = 1,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
@@ -898,29 +966,27 @@ static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
-static int io_prep_work_files(struct io_kiocb *req);
static void __io_clean_op(struct io_kiocb *req);
-static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
- int fd, struct file **out_file, bool fixed);
-static void __io_queue_sqe(struct io_kiocb *req,
- const struct io_uring_sqe *sqe,
- struct io_comp_state *cs);
+static struct file *io_file_get(struct io_submit_state *state,
+ struct io_kiocb *req, int fd, bool fixed);
+static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock);
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
- struct iovec *iovec, struct iovec *fast_iov,
- struct iov_iter *iter);
+static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
+ const struct iovec *fast_iov,
+ struct iov_iter *iter, bool force);
static struct kmem_cache *req_cachep;
-static const struct file_operations io_uring_fops;
+static const struct file_operations io_uring_fops __read_mostly;
struct sock *io_uring_get_socket(struct file *file)
{
@@ -935,27 +1001,13 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
-static void io_get_req_task(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_TASK_PINNED)
- return;
- get_task_struct(req->task);
- req->flags |= REQ_F_TASK_PINNED;
-}
-
static inline void io_clean_op(struct io_kiocb *req)
{
- if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+ if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
+ REQ_F_INFLIGHT))
__io_clean_op(req);
}
-/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
-static void __io_put_req_task(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_TASK_PINNED)
- put_task_struct(req->task);
-}
-
static void io_sq_thread_drop_mm(void)
{
struct mm_struct *mm = current->mm;
@@ -970,9 +1022,10 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
if (!current->mm) {
if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
- !mmget_not_zero(ctx->sqo_mm)))
+ !ctx->sqo_task->mm ||
+ !mmget_not_zero(ctx->sqo_task->mm)))
return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
+ kthread_use_mm(ctx->sqo_task->mm);
}
return 0;
@@ -986,6 +1039,26 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
return __io_sq_thread_acquire_mm(ctx);
}
+static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
+ struct cgroup_subsys_state **cur_css)
+
+{
+#ifdef CONFIG_BLK_CGROUP
+ /* puts the old one when swapping */
+ if (*cur_css != ctx->sqo_blkcg_css) {
+ kthread_associate_blkcg(ctx->sqo_blkcg_css);
+ *cur_css = ctx->sqo_blkcg_css;
+ }
+#endif
+}
+
+static void io_sq_thread_unassociate_blkcg(void)
+{
+#ifdef CONFIG_BLK_CGROUP
+ kthread_associate_blkcg(NULL);
+#endif
+}
+
static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
@@ -1055,7 +1128,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
- init_waitqueue_head(&ctx->sqo_wait);
+ init_waitqueue_head(&ctx->sqo_sq_wait);
+ INIT_LIST_HEAD(&ctx->sqd_list);
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
@@ -1107,15 +1181,25 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static void io_req_clean_work(struct io_kiocb *req)
+/*
+ * Returns true if we need to defer file table putting. This can only happen
+ * from the error path with REQ_F_COMP_LOCKED set.
+ */
+static bool io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
- return;
+ return false;
+
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
}
+#ifdef CONFIG_BLK_CGROUP
+ if (req->work.blkcg_css)
+ css_put(req->work.blkcg_css);
+#endif
if (req->work.creds) {
put_cred(req->work.creds);
req->work.creds = NULL;
@@ -1123,6 +1207,9 @@ static void io_req_clean_work(struct io_kiocb *req)
if (req->work.fs) {
struct fs_struct *fs = req->work.fs;
+ if (req->flags & REQ_F_COMP_LOCKED)
+ return true;
+
spin_lock(&req->work.fs->lock);
if (--fs->users)
fs = NULL;
@@ -1131,26 +1218,52 @@ static void io_req_clean_work(struct io_kiocb *req)
free_fs_struct(fs);
req->work.fs = NULL;
}
- req->flags &= ~REQ_F_WORK_INITIALIZED;
+
+ return false;
}
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_ring_ctx *ctx = req->ctx;
io_req_init_async(req);
if (req->flags & REQ_F_ISREG) {
- if (def->hash_reg_file)
+ if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ if (!req->work.files && io_op_defs[req->opcode].file_table &&
+ !(req->flags & REQ_F_NO_FILE_TABLE)) {
+ req->work.files = get_files_struct(current);
+ get_nsproxy(current->nsproxy);
+ req->work.nsproxy = current->nsproxy;
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+ }
if (!req->work.mm && def->needs_mm) {
mmgrab(current->mm);
req->work.mm = current->mm;
}
+#ifdef CONFIG_BLK_CGROUP
+ if (!req->work.blkcg_css && def->needs_blkcg) {
+ rcu_read_lock();
+ req->work.blkcg_css = blkcg_css();
+ /*
+ * This should be rare, either the cgroup is dying or the task
+ * is moving cgroups. Just punt to root for the handful of ios.
+ */
+ if (!css_tryget_online(req->work.blkcg_css))
+ req->work.blkcg_css = NULL;
+ rcu_read_unlock();
+ }
+#endif
if (!req->work.creds)
req->work.creds = get_current_cred();
if (!req->work.fs && def->needs_fs) {
@@ -1179,7 +1292,7 @@ static void io_prep_async_link(struct io_kiocb *req)
io_prep_async_work(cur);
}
-static void __io_queue_async_work(struct io_kiocb *req)
+static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
@@ -1187,23 +1300,27 @@ static void __io_queue_async_work(struct io_kiocb *req)
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
io_wq_enqueue(ctx->io_wq, &req->work);
-
- if (link)
- io_queue_linked_timeout(link);
+ return link;
}
static void io_queue_async_work(struct io_kiocb *req)
{
+ struct io_kiocb *link;
+
/* init ->work of the whole link before punting */
io_prep_async_link(req);
- __io_queue_async_work(req);
+ link = __io_queue_async_work(req);
+
+ if (link)
+ io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
+ struct io_timeout_data *io = req->async_data;
int ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
@@ -1214,14 +1331,36 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static void io_kill_timeouts(struct io_ring_ctx *ctx)
+static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!tsk || req->task == tsk)
+ return true;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (ctx->sq_data && req->task == ctx->sq_data->thread)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Returns true if we found and killed one or more timeouts
+ */
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct io_kiocb *req, *tmp;
+ int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
- io_kill_timeout(req);
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
+ if (io_task_match(req, tsk)) {
+ io_kill_timeout(req);
+ canceled++;
+ }
+ }
spin_unlock_irq(&ctx->completion_lock);
+ return canceled != 0;
}
static void __io_queue_deferred(struct io_ring_ctx *ctx)
@@ -1229,12 +1368,19 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
do {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
+ struct io_kiocb *link;
if (req_need_defer(de->req, de->seq))
break;
list_del_init(&de->list);
/* punt-init is done before queueing for defer */
- __io_queue_async_work(de->req);
+ link = __io_queue_async_work(de->req);
+ if (link) {
+ __io_queue_linked_timeout(link);
+ /* drop submission reference */
+ link->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(link);
+ }
kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1265,6 +1411,13 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_queue_deferred(ctx);
}
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+{
+ struct io_rings *r = ctx->rings;
+
+ return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
+}
+
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
@@ -1298,8 +1451,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- if (waitqueue_active(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
+ if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
@@ -1313,12 +1466,24 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
+static inline bool io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ if (!files)
+ return true;
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ return req->work.files == files;
+ return false;
+}
+
/* Returns true if there are no backlogged entries after the flush */
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+ struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_rings *rings = ctx->rings;
+ struct io_kiocb *req, *tmp;
struct io_uring_cqe *cqe;
- struct io_kiocb *req;
unsigned long flags;
LIST_HEAD(list);
@@ -1337,15 +1502,17 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ctx->cq_overflow_flushed = 1;
cqe = NULL;
- while (!list_empty(&ctx->cq_overflow_list)) {
+ list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
+ if (tsk && req->task != tsk)
+ continue;
+ if (!io_match_files(req, files))
+ continue;
+
cqe = io_get_cqring(ctx);
if (!cqe && !force)
break;
- req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
- compl.list);
list_move(&req->compl.list, &list);
- req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
@@ -1388,7 +1555,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed) {
+ } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+ /*
+ * If we're in ring overflow flush mode, or in task cancel mode,
+ * then we cannot store the request for later flushing, we need
+ * to drop it on the floor.
+ */
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
} else {
@@ -1398,7 +1570,6 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
io_clean_op(req);
- req->flags |= REQ_F_OVERFLOW;
req->result = res;
req->compl.cflags = cflags;
refcount_inc(&req->refs);
@@ -1492,10 +1663,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- struct io_kiocb *req;
-
if (!state->free_reqs) {
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
size_t sz;
int ret;
@@ -1512,14 +1681,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
goto fallback;
ret = 1;
}
- state->free_reqs = ret - 1;
- req = state->reqs[ret - 1];
- } else {
- state->free_reqs--;
- req = state->reqs[state->free_reqs];
+ state->free_reqs = ret;
}
- return req;
+ state->free_reqs--;
+ return state->reqs[state->free_reqs];
fallback:
return io_get_fallback_req(ctx);
}
@@ -1533,35 +1699,28 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static bool io_dismantle_req(struct io_kiocb *req)
{
io_clean_op(req);
- if (req->io)
- kfree(req->io);
+ if (req->async_data)
+ kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- io_req_clean_work(req);
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- }
+ return io_req_clean_work(req);
}
-static void __io_free_req(struct io_kiocb *req)
+static void __io_free_req_finish(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ atomic_long_inc(&tctx->req_complete);
+ if (tctx->in_idle)
+ wake_up(&tctx->wait);
+ put_task_struct(req->task);
- io_dismantle_req(req);
- __io_put_req_task(req);
- ctx = req->ctx;
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
@@ -1569,12 +1728,46 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
+static void io_req_task_file_table_put(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct fs_struct *fs = req->work.fs;
+
+ spin_lock(&req->work.fs->lock);
+ if (--fs->users)
+ fs = NULL;
+ spin_unlock(&req->work.fs->lock);
+ if (fs)
+ free_fs_struct(fs);
+ req->work.fs = NULL;
+ __io_free_req_finish(req);
+}
+
+static void __io_free_req(struct io_kiocb *req)
+{
+ if (!io_dismantle_req(req)) {
+ __io_free_req_finish(req);
+ } else {
+ int ret;
+
+ init_task_work(&req->task_work, io_req_task_file_table_put);
+ ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ }
+ }
+}
+
static bool io_link_cancel_timeout(struct io_kiocb *req)
{
+ struct io_timeout_data *io = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
@@ -1598,6 +1791,7 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
return false;
list_del_init(&link->link_list);
+ link->flags |= REQ_F_COMP_LOCKED;
wake_ev = io_link_cancel_timeout(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
return wake_ev;
@@ -1656,6 +1850,7 @@ static void __io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
+ link->flags |= REQ_F_COMP_LOCKED;
__io_double_put_req(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
}
@@ -1706,26 +1901,29 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
- int ret, notify = TWA_RESUME;
+ int ret, notify;
+
+ if (tsk->flags & PF_EXITING)
+ return -ESRCH;
/*
- * SQPOLL kernel thread doesn't need notification, just a wakeup.
- * If we're not using an eventfd, then TWA_RESUME is always fine,
- * as we won't have dependencies between request completions for
- * other kernel wait conditions.
+ * SQPOLL kernel thread doesn't need notification, just a wakeup. For
+ * all other cases, use TWA_SIGNAL unconditionally to ensure we're
+ * processing task_work. There's no reliable way to tell if TWA_RESUME
+ * will do the job.
*/
- if (ctx->flags & IORING_SETUP_SQPOLL)
- notify = 0;
- else if (ctx->cq_ev_fd)
+ notify = 0;
+ if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
notify = TWA_SIGNAL;
- ret = task_work_add(tsk, cb, notify);
+ ret = task_work_add(tsk, &req->task_work, notify);
if (!ret)
wake_up_process(tsk);
+
return ret;
}
@@ -1746,8 +1944,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error)
static void io_req_task_cancel(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
__io_req_task_cancel(req, -ECANCELED);
+ percpu_ref_put(&ctx->refs);
}
static void __io_req_task_submit(struct io_kiocb *req)
@@ -1756,7 +1956,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
if (!__io_sq_thread_acquire_mm(ctx)) {
mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(req, NULL, NULL);
+ __io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
} else {
__io_req_task_cancel(req, -EFAULT);
@@ -1766,8 +1966,10 @@ static void __io_req_task_submit(struct io_kiocb *req)
static void io_req_task_submit(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
__io_req_task_submit(req);
+ percpu_ref_put(&ctx->refs);
}
static void io_req_task_queue(struct io_kiocb *req)
@@ -1775,8 +1977,9 @@ static void io_req_task_queue(struct io_kiocb *req)
int ret;
init_task_work(&req->task_work, io_req_task_submit);
+ percpu_ref_get(&req->ctx->refs);
- ret = io_req_task_work_add(req, &req->task_work);
+ ret = io_req_task_work_add(req, true);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -1830,6 +2033,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
if (rb->to_free)
__io_req_free_batch_flush(ctx, rb);
if (rb->task) {
+ atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
put_task_struct_many(rb->task, rb->task_refs);
rb->task = NULL;
}
@@ -1844,18 +2048,17 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
if (req->flags & REQ_F_LINK_HEAD)
io_queue_next(req);
- if (req->flags & REQ_F_TASK_PINNED) {
- if (req->task != rb->task) {
- if (rb->task)
- put_task_struct_many(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
+ if (req->task != rb->task) {
+ if (rb->task) {
+ atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
+ put_task_struct_many(rb->task, rb->task_refs);
}
- rb->task_refs++;
- req->flags &= ~REQ_F_TASK_PINNED;
+ rb->task = req->task;
+ rb->task_refs = 0;
}
+ rb->task_refs++;
- io_dismantle_req(req);
+ WARN_ON_ONCE(io_dismantle_req(req));
rb->reqs[rb->to_free++] = req;
if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
__io_req_free_batch_flush(req->ctx, rb);
@@ -1929,7 +2132,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
if (noflush && !list_empty(&ctx->cq_overflow_list))
return -1U;
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false, NULL, NULL);
}
/* See comment at the top of this file */
@@ -1966,6 +2169,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
+ /*
+ * Not safe to run on exiting task, and the task_work handling will
+ * not add work to such a task.
+ */
+ if (unlikely(current->flags & PF_EXITING))
+ return false;
if (current->task_works) {
__set_current_state(TASK_RUNNING);
task_work_run();
@@ -2005,6 +2214,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
if (READ_ONCE(req->result) == -EAGAIN) {
+ req->result = 0;
req->iopoll_completed = 0;
list_move_tail(&req->inflight_entry, &again);
continue;
@@ -2238,46 +2448,43 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
goto end_req;
}
- ret = io_import_iovec(rw, req, &iovec, &iter, false);
- if (ret < 0)
- goto end_req;
- ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
- if (!ret)
+ if (!req->async_data) {
+ ret = io_import_iovec(rw, req, &iovec, &iter, false);
+ if (ret < 0)
+ goto end_req;
+ ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
+ if (!ret)
+ return true;
+ kfree(iovec);
+ } else {
return true;
- kfree(iovec);
+ }
end_req:
req_set_fail_links(req);
io_req_complete(req, ret);
return false;
}
-
-static void io_rw_resubmit(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_ring_ctx *ctx = req->ctx;
- int err;
-
- err = io_sq_thread_acquire_mm(ctx, req);
-
- if (io_resubmit_prep(req, err)) {
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- }
-}
#endif
static bool io_rw_reissue(struct io_kiocb *req, long res)
{
#ifdef CONFIG_BLOCK
+ umode_t mode = file_inode(req->file)->i_mode;
int ret;
+ if (!S_ISBLK(mode) && !S_ISREG(mode))
+ return false;
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- init_task_work(&req->task_work, io_rw_resubmit);
- ret = io_req_task_work_add(req, &req->task_work);
- if (!ret)
+ ret = io_sq_thread_acquire_mm(req->ctx, req);
+
+ if (io_resubmit_prep(req, ret)) {
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
return true;
+ }
+
#endif
return false;
}
@@ -2348,8 +2555,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
if ((ctx->flags & IORING_SETUP_SQPOLL) &&
- wq_has_sleeper(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
+ wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
}
static void __io_state_file_put(struct io_submit_state *state)
@@ -2378,7 +2585,6 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (state->file) {
if (state->fd == fd) {
state->has_refs--;
- state->ios_left--;
return state->file;
}
__io_state_file_put(state);
@@ -2388,8 +2594,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
return NULL;
state->fd = fd;
- state->ios_left--;
- state->has_refs = state->ios_left;
+ state->has_refs = state->ios_left - 1;
return state->file;
}
@@ -2438,8 +2643,7 @@ static bool io_file_supports_async(struct file *file, int rw)
return file->f_op->write_iter != NULL;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2474,12 +2678,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
- if (kiocb->ki_flags & IOCB_DIRECT)
- io_get_req_task(req);
-
- if (force_nonblock)
- kiocb->ki_flags |= IOCB_NOWAIT;
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) ||
!kiocb->ki_filp->f_op->iopoll)
@@ -2488,7 +2686,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
- io_get_req_task(req);
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -2516,7 +2713,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
* IO with EINTR.
*/
ret = -EINTR;
- /* fall through */
+ fallthrough;
default:
kiocb->ki_complete(kiocb, ret, 0);
}
@@ -2526,6 +2723,15 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_comp_state *cs)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+ struct io_async_rw *io = req->async_data;
+
+ /* add previously done IO, if any */
+ if (io && io->bytes_done > 0) {
+ if (ret < 0)
+ ret = io->bytes_done;
+ else
+ ret += io->bytes_done;
+ }
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
@@ -2541,18 +2747,12 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct io_ring_ctx *ctx = req->ctx;
size_t len = req->rw.len;
struct io_mapped_ubuf *imu;
- u16 index, buf_index;
+ u16 index, buf_index = req->buf_index;
size_t offset;
u64 buf_addr;
- /* attempt to use fixed buffers without having provided iovecs */
- if (unlikely(!ctx->user_bufs))
- return -EFAULT;
-
- buf_index = req->buf_index;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
-
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = &ctx->user_bufs[index];
buf_addr = req->rw.addr;
@@ -2749,9 +2949,9 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
+static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock)
{
void __user *buf = u64_to_user_ptr(req->rw.addr);
size_t sqe_len = req->rw.len;
@@ -2771,10 +2971,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
- if (IS_ERR(buf)) {
- *iovec = NULL;
+ if (IS_ERR(buf))
return PTR_ERR(buf);
- }
req->rw.len = sqe_len;
}
@@ -2783,14 +2981,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return ret < 0 ? ret : sqe_len;
}
- if (req->io) {
- struct io_async_rw *iorw = &req->io->rw;
-
- iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
- *iovec = NULL;
- return iorw->size;
- }
-
if (req->flags & REQ_F_BUFFER_SELECT) {
ret = io_iov_buffer_select(req, *iovec, needs_lock);
if (!ret) {
@@ -2801,13 +2991,25 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return ret;
}
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
- iovec, iter);
-#endif
+ return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ req->ctx->compat);
+}
+
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock)
+{
+ struct io_async_rw *iorw = req->async_data;
+
+ if (!iorw)
+ return __io_import_iovec(rw, req, iovec, iter, needs_lock);
+ *iovec = NULL;
+ return iov_iter_count(&iorw->iter);
+}
- return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
+{
+ return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}
/*
@@ -2845,10 +3047,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, &kiocb->ki_pos);
+ iovec.iov_len, io_kiocb_ppos(kiocb));
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, &kiocb->ki_pos);
+ iovec.iov_len, io_kiocb_ppos(kiocb));
}
if (iov_iter_is_bvec(iter))
@@ -2868,78 +3070,85 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
-static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
- struct iovec *iovec, struct iovec *fast_iov,
- struct iov_iter *iter)
+static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
+ const struct iovec *fast_iov, struct iov_iter *iter)
{
- struct io_async_rw *rw = &req->io->rw;
+ struct io_async_rw *rw = req->async_data;
- rw->nr_segs = iter->nr_segs;
- rw->size = io_size;
+ memcpy(&rw->iter, iter, sizeof(*iter));
+ rw->free_iovec = iovec;
+ rw->bytes_done = 0;
+ /* can only be fixed buffers, no need to do anything */
+ if (iter->type == ITER_BVEC)
+ return;
if (!iovec) {
- rw->iov = rw->fast_iov;
- if (rw->iov != fast_iov)
- memcpy(rw->iov, fast_iov,
+ unsigned iov_off = 0;
+
+ rw->iter.iov = rw->fast_iov;
+ if (iter->iov != fast_iov) {
+ iov_off = iter->iov - fast_iov;
+ rw->iter.iov += iov_off;
+ }
+ if (rw->fast_iov != fast_iov)
+ memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
- rw->iov = iovec;
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int __io_alloc_async_ctx(struct io_kiocb *req)
+static inline int __io_alloc_async_data(struct io_kiocb *req)
{
- req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
- return req->io == NULL;
+ WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
+ req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
+ return req->async_data == NULL;
}
-static int io_alloc_async_ctx(struct io_kiocb *req)
+static int io_alloc_async_data(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].async_ctx)
+ if (!io_op_defs[req->opcode].needs_async_data)
return 0;
- return __io_alloc_async_ctx(req);
+ return __io_alloc_async_data(req);
}
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
- struct iovec *iovec, struct iovec *fast_iov,
- struct iov_iter *iter)
+static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
+ const struct iovec *fast_iov,
+ struct iov_iter *iter, bool force)
{
- if (!io_op_defs[req->opcode].async_ctx)
+ if (!force && !io_op_defs[req->opcode].needs_async_data)
return 0;
- if (!req->io) {
- if (__io_alloc_async_ctx(req))
+ if (!req->async_data) {
+ if (__io_alloc_async_data(req))
return -ENOMEM;
- io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, fast_iov, iter);
}
return 0;
}
-static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
- bool force_nonblock)
+static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
- struct io_async_ctx *io = req->io;
- struct iov_iter iter;
+ struct io_async_rw *iorw = req->async_data;
+ struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- io->rw.iov = io->rw.fast_iov;
- req->io = NULL;
- ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
- req->io = io;
+ ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
- io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+ iorw->bytes_done = 0;
+ iorw->free_iovec = iov;
+ if (iov)
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
ssize_t ret;
- ret = io_prep_rw(req, sqe, force_nonblock);
+ ret = io_prep_rw(req, sqe);
if (ret)
return ret;
@@ -2947,11 +3156,21 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return -EBADF;
/* either don't need iovec imported or already have it */
- if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
+ if (!req->async_data)
return 0;
- return io_rw_prep_async(req, READ, force_nonblock);
+ return io_rw_prep_async(req, READ);
}
+/*
+ * This is our waitqueue callback handler, registered through lock_page_async()
+ * when we initially tried to do the IO with the iocb armed our waitqueue.
+ * This gets called when the page is unlocked, and we generally expect that to
+ * happen when the page IO is completed and the page is now uptodate. This will
+ * queue a task_work based retry of the operation, attempting to copy the data
+ * again. If the latter fails because the page was NOT uptodate, then we will
+ * do a thread based blocking retry of the operation. That's the unexpected
+ * slow path.
+ */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
int sync, void *arg)
{
@@ -2965,16 +3184,15 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
if (!wake_page_match(wpq, key))
return 0;
- /* Stop waking things up if the page is locked again */
- if (test_bit(key->bit_nr, &key->page->flags))
- return -1;
-
+ req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
list_del_init(&wait->entry);
init_task_work(&req->task_work, io_req_task_submit);
+ percpu_ref_get(&req->ctx->refs);
+
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, &req->task_work);
+ ret = io_req_task_work_add(req, true);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -2987,40 +3205,32 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
return 1;
}
-static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
- struct wait_page_queue *wait,
- wait_queue_func_t func,
- void *data)
-{
- /* Can't support async wakeup with polled IO */
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EINVAL;
- if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
- wait->wait.func = func;
- wait->wait.private = data;
- wait->wait.flags = 0;
- INIT_LIST_HEAD(&wait->wait.entry);
- kiocb->ki_flags |= IOCB_WAITQ;
- kiocb->ki_waitq = wait;
- return 0;
- }
-
- return -EOPNOTSUPP;
-}
-
-
+/*
+ * This controls whether a given IO request should be armed for async page
+ * based retry. If we return false here, the request is handed to the async
+ * worker threads for retry. If we're doing buffered reads on a regular file,
+ * we prepare a private wait_page_queue entry and retry the operation. This
+ * will either succeed because the page is now uptodate and unlocked, or it
+ * will register a callback when the page is unlocked at IO completion. Through
+ * that callback, io_uring uses task_work to setup a retry of the operation.
+ * That retry will attempt the buffered read again. The retry will generally
+ * succeed, or in rare cases where it fails, we then fall back to using the
+ * async worker threads for a blocking retry.
+ */
static bool io_rw_should_retry(struct io_kiocb *req)
{
+ struct io_async_rw *rw = req->async_data;
+ struct wait_page_queue *wait = &rw->wpq;
struct kiocb *kiocb = &req->rw.kiocb;
- int ret;
/* never retry for NOWAIT, we just complete with -EAGAIN */
if (req->flags & REQ_F_NOWAIT)
return false;
- /* already tried, or we're doing O_DIRECT */
- if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+ /* Only for buffered IO */
+ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
return false;
+
/*
* just use poll if we can, and don't attempt if the fs doesn't
* support callback based unlocks
@@ -3028,28 +3238,24 @@ static bool io_rw_should_retry(struct io_kiocb *req)
if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
return false;
- /*
- * If request type doesn't require req->io to defer in general,
- * we need to allocate it here
- */
- if (!req->io && __io_alloc_async_ctx(req))
- return false;
-
- ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
- io_async_buf_func, req);
- if (!ret) {
- io_get_req_task(req);
- return true;
- }
-
- return false;
+ wait->wait.func = io_async_buf_func;
+ wait->wait.private = req;
+ wait->wait.flags = 0;
+ INIT_LIST_HEAD(&wait->wait.entry);
+ kiocb->ki_flags |= IOCB_WAITQ;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ kiocb->ki_waitq = wait;
+ return true;
}
static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
- return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+ else if (req->file->f_op->read)
+ return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+ else
+ return -EINVAL;
}
static int io_read(struct io_kiocb *req, bool force_nonblock,
@@ -3057,71 +3263,118 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter iter;
- size_t iov_count;
+ struct iov_iter __iter, *iter = &__iter;
+ struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- unsigned long nr_segs;
+ size_t iov_count;
+ bool no_async;
- ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
+ if (rw)
+ iter = &rw->iter;
+
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ iov_count = iov_iter_count(iter);
io_size = ret;
req->result = io_size;
+ ret = 0;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
kiocb->ki_flags &= ~IOCB_NOWAIT;
+ else
+ kiocb->ki_flags |= IOCB_NOWAIT;
+
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req->file, READ))
+ no_async = force_nonblock && !io_file_supports_async(req->file, READ);
+ if (no_async)
goto copy_iov;
- iov_count = iov_iter_count(&iter);
- nr_segs = iter.nr_segs;
- ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
if (unlikely(ret))
goto out_free;
- ret2 = io_iter_do_read(req, &iter);
+ ret = io_iter_do_read(req, iter);
- /* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
- kiocb_done(kiocb, ret2, cs);
- } else {
- iter.count = iov_count;
- iter.nr_segs = nr_segs;
+ if (!ret) {
+ goto done;
+ } else if (ret == -EIOCBQUEUED) {
+ ret = 0;
+ goto out_free;
+ } else if (ret == -EAGAIN) {
+ /* IOPOLL retry should happen for io-wq threads */
+ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
+ goto done;
+ /* no retry on NONBLOCK marked file */
+ if (req->file->f_flags & O_NONBLOCK)
+ goto done;
+ /* some cases will consume bytes even on error returns */
+ iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ ret = 0;
+ goto copy_iov;
+ } else if (ret < 0) {
+ /* make sure -ERESTARTSYS -> -EINTR is done */
+ goto done;
+ }
+
+ /* read it all, or we did blocking attempt. no retry. */
+ if (!iov_iter_count(iter) || !force_nonblock ||
+ (req->file->f_flags & O_NONBLOCK))
+ goto done;
+
+ io_size -= ret;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
- &iter);
- if (ret)
- goto out_free;
- /* it's copied and will be cleaned with ->io */
- iovec = NULL;
- /* if we can retry, do so with the callbacks armed */
- if (io_rw_should_retry(req)) {
- ret2 = io_iter_do_read(req, &iter);
- if (ret2 == -EIOCBQUEUED) {
- goto out_free;
- } else if (ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2, cs);
- goto out_free;
- }
- }
+ ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ if (ret2) {
+ ret = ret2;
+ goto out_free;
+ }
+ if (no_async)
+ return -EAGAIN;
+ rw = req->async_data;
+ /* it's copied and will be cleaned with ->io */
+ iovec = NULL;
+ /* now use our persistent iterator, if we aren't already */
+ iter = &rw->iter;
+retry:
+ rw->bytes_done += ret;
+ /* if we can retry, do so with the callbacks armed */
+ if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
return -EAGAIN;
}
+
+ /*
+ * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
+ * get -EIOCBQUEUED, then we'll get a notification when the desired
+ * page gets unlocked. We can also get a partial read here, and if we
+ * do, then just retry at the new offset.
+ */
+ ret = io_iter_do_read(req, iter);
+ if (ret == -EIOCBQUEUED) {
+ ret = 0;
+ goto out_free;
+ } else if (ret > 0 && ret < io_size) {
+ /* we got some bytes, but not all. retry. */
+ goto retry;
+ }
+done:
+ kiocb_done(kiocb, ret, cs);
+ ret = 0;
out_free:
+ /* it's reportedly faster than delegating the null check to kfree() */
if (iovec)
kfree(iovec);
return ret;
}
-static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
ssize_t ret;
- ret = io_prep_rw(req, sqe, force_nonblock);
+ ret = io_prep_rw(req, sqe);
if (ret)
return ret;
@@ -3129,9 +3382,9 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return -EBADF;
/* either don't need iovec imported or already have it */
- if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
+ if (!req->async_data)
return 0;
- return io_rw_prep_async(req, WRITE, force_nonblock);
+ return io_rw_prep_async(req, WRITE);
}
static int io_write(struct io_kiocb *req, bool force_nonblock,
@@ -3139,20 +3392,26 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter iter;
+ struct iov_iter __iter, *iter = &__iter;
+ struct io_async_rw *rw = req->async_data;
size_t iov_count;
ssize_t ret, ret2, io_size;
- unsigned long nr_segs;
- ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
+ if (rw)
+ iter = &rw->iter;
+
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ iov_count = iov_iter_count(iter);
io_size = ret;
req->result = io_size;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
- req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ else
+ kiocb->ki_flags |= IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
if (force_nonblock && !io_file_supports_async(req->file, WRITE))
@@ -3163,9 +3422,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- iov_count = iov_iter_count(&iter);
- nr_segs = iter.nr_segs;
- ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
if (unlikely(ret))
goto out_free;
@@ -3185,9 +3442,11 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
kiocb->ki_flags |= IOCB_WRITE;
if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, &iter);
+ ret2 = call_write_iter(req->file, kiocb, iter);
+ else if (req->file->f_op->write)
+ ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
else
- ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+ ret2 = -EINVAL;
/*
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
@@ -3195,21 +3454,25 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
*/
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN;
+ /* no retry on NONBLOCK marked file */
+ if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
+ goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
+ /* IOPOLL retry should happen for io-wq threads */
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ goto copy_iov;
+done:
kiocb_done(kiocb, ret2, cs);
} else {
- iter.count = iov_count;
- iter.nr_segs = nr_segs;
copy_iov:
- ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
- &iter);
- if (ret)
- goto out_free;
- /* it's copied and will be cleaned with ->io */
- iovec = NULL;
- return -EAGAIN;
+ /* some cases will consume bytes even on error returns */
+ iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ if (!ret)
+ return -EAGAIN;
}
out_free:
+ /* it's reportedly faster than delegating the null check to kfree() */
if (iovec)
kfree(iovec);
return ret;
@@ -3220,10 +3483,7 @@ static int __io_splice_prep(struct io_kiocb *req,
{
struct io_splice* sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
- int ret;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -3234,10 +3494,10 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
- ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (ret)
- return ret;
+ sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
+ (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (!sp->file_in)
+ return -EBADF;
req->flags |= REQ_F_NEED_CLEANUP;
if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
@@ -3405,8 +3665,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
- return -EINVAL;
if (unlikely(sqe->ioprio || sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
@@ -3433,8 +3691,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
req->open.how = build_open_how(flags, mode);
@@ -3447,8 +3705,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (len < OPEN_HOW_SIZE_VER0)
@@ -3664,7 +3922,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -3779,7 +4037,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
@@ -3835,8 +4093,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- if ((req->file && req->file->f_op == &io_uring_fops) ||
- req->close.fd == req->ctx->ring_fd)
+ if ((req->file && req->file->f_op == &io_uring_fops))
return -EBADF;
req->close.put_file = NULL;
@@ -3913,15 +4170,18 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
- if (req->io)
+ struct io_async_msghdr *async_msg = req->async_data;
+
+ if (async_msg)
return -EAGAIN;
- if (io_alloc_async_ctx(req)) {
+ if (io_alloc_async_data(req)) {
if (kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
return -ENOMEM;
}
+ async_msg = req->async_data;
req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
+ memcpy(async_msg, kmsg, sizeof(*kmsg));
return -EAGAIN;
}
@@ -3936,8 +4196,8 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ struct io_async_msghdr *async_msg = req->async_data;
struct io_sr_msg *sr = &req->sr_msg;
- struct io_async_ctx *io = req->io;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3952,13 +4212,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- if (!io || req->opcode == IORING_OP_SEND)
+ if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
return 0;
- /* iovec is already imported */
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
-
- ret = io_sendmsg_copy_hdr(req, &io->msg);
+ ret = io_sendmsg_copy_hdr(req, async_msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
@@ -3976,9 +4232,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
if (unlikely(!sock))
return ret;
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
+ if (req->async_data) {
+ kmsg = req->async_data;
+ kmsg->msg.msg_name = &kmsg->addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -4027,7 +4283,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
- return ret;;
+ return ret;
msg.msg_name = NULL;
msg.msg_control = NULL;
@@ -4076,8 +4332,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
sr->len);
iomsg->iov = NULL;
} else {
- ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &iomsg->iov, &iomsg->msg.msg_iter);
+ ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+ &iomsg->iov, &iomsg->msg.msg_iter,
+ false);
if (ret > 0)
ret = 0;
}
@@ -4117,9 +4374,9 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
sr->len = iomsg->iov[0].iov_len;
iomsg->iov = NULL;
} else {
- ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
- &iomsg->iov,
- &iomsg->msg.msg_iter);
+ ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
+ UIO_FASTIOV, &iomsg->iov,
+ &iomsg->msg.msg_iter, true);
if (ret < 0)
return ret;
}
@@ -4165,8 +4422,8 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_async_msghdr *async_msg = req->async_data;
struct io_sr_msg *sr = &req->sr_msg;
- struct io_async_ctx *io = req->io;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -4182,13 +4439,9 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- if (!io || req->opcode == IORING_OP_RECV)
+ if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
return 0;
- /* iovec is already imported */
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
-
- ret = io_recvmsg_copy_hdr(req, &io->msg);
+ ret = io_recvmsg_copy_hdr(req, async_msg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
@@ -4207,9 +4460,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
if (unlikely(!sock))
return ret;
- if (req->io) {
- kmsg = &req->io->msg;
- kmsg->msg.msg_name = &req->io->msg.addr;
+ if (req->async_data) {
+ kmsg = req->async_data;
+ kmsg->msg.msg_name = &kmsg->addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -4351,7 +4604,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock,
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- struct io_async_ctx *io = req->io;
+ struct io_async_connect *io = req->async_data;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
@@ -4365,22 +4618,22 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
return move_addr_to_kernel(conn->addr, conn->addr_len,
- &io->connect.address);
+ &io->address);
}
static int io_connect(struct io_kiocb *req, bool force_nonblock,
struct io_comp_state *cs)
{
- struct io_async_ctx __io, *io;
+ struct io_async_connect __io, *io;
unsigned file_flags;
int ret;
- if (req->io) {
- io = req->io;
+ if (req->async_data) {
+ io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
req->connect.addr_len,
- &__io.connect.address);
+ &__io.address);
if (ret)
goto out;
io = &__io;
@@ -4388,16 +4641,17 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock,
file_flags = force_nonblock ? O_NONBLOCK : 0;
- ret = __sys_connect_file(req->file, &io->connect.address,
+ ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->io)
+ if (req->async_data)
return -EAGAIN;
- if (io_alloc_async_ctx(req)) {
+ if (io_alloc_async_data(req)) {
ret = -ENOMEM;
goto out;
}
- memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
+ io = req->async_data;
+ memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
@@ -4476,6 +4730,7 @@ struct io_poll_table {
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
+ bool twa_signal_ok;
int ret;
/* for instances that support it check for an event match first: */
@@ -4488,13 +4743,23 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
+ percpu_ref_get(&req->ctx->refs);
+
+ /*
+ * If we using the signalfd wait_queue_head for this wakeup, then
+ * it's not safe to use TWA_SIGNAL as we could be recursing on the
+ * tsk->sighand->siglock on doing the wakeup. Should not be needed
+ * either, as the normal wakeup will suffice.
+ */
+ twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
+
/*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, &req->task_work);
+ ret = io_req_task_work_add(req, twa_signal_ok);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -4526,9 +4791,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req, void *data)
+static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = data;
+ /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
+ if (req->opcode == IORING_OP_POLL_ADD)
+ return req->async_data;
+ return req->apoll->double_poll;
+}
+
+static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
+{
+ if (req->opcode == IORING_OP_POLL_ADD)
+ return &req->poll;
+ return &req->apoll->poll;
+}
+
+static void io_poll_remove_double(struct io_kiocb *req)
+{
+ struct io_poll_iocb *poll = io_poll_get_double(req);
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4548,7 +4828,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req, req->io);
+ io_poll_remove_double(req);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4575,24 +4855,28 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
static void io_poll_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL;
io_poll_task_handler(req, &nxt);
if (nxt)
__io_req_task_submit(nxt);
+ percpu_ref_put(&ctx->refs);
}
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = req->apoll->double_poll;
+ struct io_poll_iocb *poll = io_poll_get_single(req);
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
+ list_del_init(&wait->entry);
+
if (poll && poll->head) {
bool done;
@@ -4600,6 +4884,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
done = list_empty(&poll->wait.entry);
if (!done)
list_del_init(&poll->wait.entry);
+ /* make sure double remove sees this as being gone */
+ wait->private = NULL;
spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
@@ -4675,6 +4961,7 @@ static void io_async_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
+ percpu_ref_put(&ctx->refs);
return;
}
@@ -4682,7 +4969,7 @@ static void io_async_task_func(struct callback_head *cb)
if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- io_poll_remove_double(req, apoll->double_poll);
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
@@ -4690,6 +4977,7 @@ static void io_async_task_func(struct callback_head *cb)
else
__io_req_task_cancel(req, -ECANCELED);
+ percpu_ref_put(&ctx->refs);
kfree(apoll->double_poll);
kfree(apoll);
}
@@ -4762,12 +5050,20 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
+ int rw;
if (!req->file || !file_can_poll(req->file))
return false;
if (req->flags & REQ_F_POLLED)
return false;
- if (!def->pollin && !def->pollout)
+ if (def->pollin)
+ rw = READ;
+ else if (def->pollout)
+ rw = WRITE;
+ else
+ return false;
+ /* if we can't nonblock try, then no point in arming a poll handler */
+ if (!io_file_supports_async(req->file, rw))
return false;
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@ -4776,7 +5072,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
apoll->double_poll = NULL;
req->flags |= REQ_F_POLLED;
- io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4791,8 +5086,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
- if (ret) {
- io_poll_remove_double(req, apoll->double_poll);
+ if (ret || ipt.error) {
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
kfree(apoll->double_poll);
kfree(apoll);
@@ -4824,14 +5119,13 @@ static bool io_poll_remove_one(struct io_kiocb *req)
{
bool do_complete;
+ io_poll_remove_double(req);
+
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
- io_poll_remove_double(req, apoll->double_poll);
-
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
@@ -4845,13 +5139,17 @@ static bool io_poll_remove_one(struct io_kiocb *req)
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
req->flags |= REQ_F_COMP_LOCKED;
+ req_set_fail_links(req);
io_put_req(req);
}
return do_complete;
}
-static void io_poll_remove_all(struct io_ring_ctx *ctx)
+/*
+ * Returns true if we found and killed one or more poll requests
+ */
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -4862,13 +5160,17 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
struct hlist_head *list;
list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(req, tmp, list, hash_node)
- posted += io_poll_remove_one(req);
+ hlist_for_each_entry_safe(req, tmp, list, hash_node) {
+ if (io_task_match(req, tsk))
+ posted += io_poll_remove_one(req);
+ }
}
spin_unlock_irq(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
+
+ return posted != 0;
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4936,7 +5238,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
+ __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4957,8 +5259,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
#endif
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
(events & EPOLLEXCLUSIVE);
-
- io_get_req_task(req);
return 0;
}
@@ -4997,16 +5297,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
+ list_del_init(&req->timeout.list);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
- /*
- * We could be racing with timeout deletion. If the list is empty,
- * then timeout lookup already found it and will be handling it.
- */
- if (!list_empty(&req->timeout.list))
- list_del_init(&req->timeout.list);
-
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -5017,6 +5311,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static int __io_timeout_cancel(struct io_kiocb *req)
+{
+ struct io_timeout_data *io = req->async_data;
+ int ret;
+
+ ret = hrtimer_try_to_cancel(&io->timer);
+ if (ret == -1)
+ return -EALREADY;
+ list_del_init(&req->timeout.list);
+
+ req_set_fail_links(req);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_cqring_fill_event(req, -ECANCELED);
+ io_put_req(req);
+ return 0;
+}
+
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
{
struct io_kiocb *req;
@@ -5024,7 +5335,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
if (user_data == req->user_data) {
- list_del_init(&req->timeout.list);
ret = 0;
break;
}
@@ -5033,14 +5343,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -ENOENT)
return ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
- if (ret == -1)
- return -EALREADY;
-
- req_set_fail_links(req);
- io_cqring_fill_event(req, -ECANCELED);
- io_put_req(req);
- return 0;
+ return __io_timeout_cancel(req);
}
static int io_timeout_remove_prep(struct io_kiocb *req,
@@ -5050,14 +5353,10 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len)
- return -EINVAL;
-
- req->timeout.addr = READ_ONCE(sqe->addr);
- req->timeout.flags = READ_ONCE(sqe->timeout_flags);
- if (req->timeout.flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
return -EINVAL;
+ req->timeout_rem.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5070,7 +5369,7 @@ static int io_timeout_remove(struct io_kiocb *req)
int ret;
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, req->timeout.addr);
+ ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -5101,10 +5400,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->timeout.off = off;
- if (!req->io && io_alloc_async_ctx(req))
+ if (!req->async_data && io_alloc_async_data(req))
return -ENOMEM;
- data = &req->io->timeout;
+ data = req->async_data;
data->req = req;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
@@ -5122,7 +5421,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data = &req->io->timeout;
+ struct io_timeout_data *data = req->async_data;
struct list_head *entry;
u32 tail, off = req->timeout.off;
@@ -5247,6 +5546,8 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
+ return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->rw_flags)
@@ -5283,118 +5584,86 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
return 0;
}
-static int io_req_defer_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- ssize_t ret = 0;
-
- if (!sqe)
- return 0;
-
- if (io_alloc_async_ctx(req))
- return -EAGAIN;
- ret = io_prep_work_files(req);
- if (unlikely(ret))
- return ret;
-
switch (req->opcode) {
case IORING_OP_NOP:
- break;
+ return 0;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- ret = io_read_prep(req, sqe, true);
- break;
+ return io_read_prep(req, sqe);
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- ret = io_write_prep(req, sqe, true);
- break;
+ return io_write_prep(req, sqe);
case IORING_OP_POLL_ADD:
- ret = io_poll_add_prep(req, sqe);
- break;
+ return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove_prep(req, sqe);
- break;
+ return io_poll_remove_prep(req, sqe);
case IORING_OP_FSYNC:
- ret = io_prep_fsync(req, sqe);
- break;
+ return io_prep_fsync(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_prep_sfr(req, sqe);
- break;
+ return io_prep_sfr(req, sqe);
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
- ret = io_sendmsg_prep(req, sqe);
- break;
+ return io_sendmsg_prep(req, sqe);
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
- ret = io_recvmsg_prep(req, sqe);
- break;
+ return io_recvmsg_prep(req, sqe);
case IORING_OP_CONNECT:
- ret = io_connect_prep(req, sqe);
- break;
+ return io_connect_prep(req, sqe);
case IORING_OP_TIMEOUT:
- ret = io_timeout_prep(req, sqe, false);
- break;
+ return io_timeout_prep(req, sqe, false);
case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove_prep(req, sqe);
- break;
+ return io_timeout_remove_prep(req, sqe);
case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel_prep(req, sqe);
- break;
+ return io_async_cancel_prep(req, sqe);
case IORING_OP_LINK_TIMEOUT:
- ret = io_timeout_prep(req, sqe, true);
- break;
+ return io_timeout_prep(req, sqe, true);
case IORING_OP_ACCEPT:
- ret = io_accept_prep(req, sqe);
- break;
+ return io_accept_prep(req, sqe);
case IORING_OP_FALLOCATE:
- ret = io_fallocate_prep(req, sqe);
- break;
+ return io_fallocate_prep(req, sqe);
case IORING_OP_OPENAT:
- ret = io_openat_prep(req, sqe);
- break;
+ return io_openat_prep(req, sqe);
case IORING_OP_CLOSE:
- ret = io_close_prep(req, sqe);
- break;
+ return io_close_prep(req, sqe);
case IORING_OP_FILES_UPDATE:
- ret = io_files_update_prep(req, sqe);
- break;
+ return io_files_update_prep(req, sqe);
case IORING_OP_STATX:
- ret = io_statx_prep(req, sqe);
- break;
+ return io_statx_prep(req, sqe);
case IORING_OP_FADVISE:
- ret = io_fadvise_prep(req, sqe);
- break;
+ return io_fadvise_prep(req, sqe);
case IORING_OP_MADVISE:
- ret = io_madvise_prep(req, sqe);
- break;
+ return io_madvise_prep(req, sqe);
case IORING_OP_OPENAT2:
- ret = io_openat2_prep(req, sqe);
- break;
+ return io_openat2_prep(req, sqe);
case IORING_OP_EPOLL_CTL:
- ret = io_epoll_ctl_prep(req, sqe);
- break;
+ return io_epoll_ctl_prep(req, sqe);
case IORING_OP_SPLICE:
- ret = io_splice_prep(req, sqe);
- break;
+ return io_splice_prep(req, sqe);
case IORING_OP_PROVIDE_BUFFERS:
- ret = io_provide_buffers_prep(req, sqe);
- break;
+ return io_provide_buffers_prep(req, sqe);
case IORING_OP_REMOVE_BUFFERS:
- ret = io_remove_buffers_prep(req, sqe);
- break;
+ return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
- ret = io_tee_prep(req, sqe);
- break;
- default:
- printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
- req->opcode);
- ret = -EINVAL;
- break;
+ return io_tee_prep(req, sqe);
}
- return ret;
+ printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
+ req->opcode);
+ return-EINVAL;
+}
+
+static int io_req_defer_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (!sqe)
+ return 0;
+ if (io_alloc_async_data(req))
+ return -EAGAIN;
+ return io_req_prep(req, sqe);
}
static u32 io_get_sequence(struct io_kiocb *req)
@@ -5428,7 +5697,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
- if (!req->io) {
+ if (!req->async_data) {
ret = io_req_defer_prep(req, sqe);
if (ret)
return ret;
@@ -5454,10 +5723,24 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EIOCBQUEUED;
}
-static void __io_clean_op(struct io_kiocb *req)
+static void io_req_drop_files(struct io_kiocb *req)
{
- struct io_async_ctx *io = req->io;
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ if (waitqueue_active(&ctx->inflight_wait))
+ wake_up(&ctx->inflight_wait);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ put_files_struct(req->work.files);
+ put_nsproxy(req->work.nsproxy);
+ req->work.files = NULL;
+}
+
+static void __io_clean_op(struct io_kiocb *req)
+{
if (req->flags & REQ_F_BUFFER_SELECTED) {
switch (req->opcode) {
case IORING_OP_READV:
@@ -5480,27 +5763,39 @@ static void __io_clean_op(struct io_kiocb *req)
case IORING_OP_READ:
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- if (io->rw.iov != io->rw.fast_iov)
- kfree(io->rw.iov);
+ case IORING_OP_WRITE: {
+ struct io_async_rw *io = req->async_data;
+ if (io->free_iovec)
+ kfree(io->free_iovec);
break;
+ }
case IORING_OP_RECVMSG:
- case IORING_OP_SENDMSG:
- if (io->msg.iov != io->msg.fast_iov)
- kfree(io->msg.iov);
+ case IORING_OP_SENDMSG: {
+ struct io_async_msghdr *io = req->async_data;
+ if (io->iov != io->fast_iov)
+ kfree(io->iov);
break;
+ }
case IORING_OP_SPLICE:
case IORING_OP_TEE:
io_put_file(req, req->splice.file_in,
(req->splice.flags & SPLICE_F_FD_IN_FIXED));
break;
+ case IORING_OP_OPENAT:
+ case IORING_OP_OPENAT2:
+ if (req->open.filename)
+ putname(req->open.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
+
+ if (req->flags & REQ_F_INFLIGHT)
+ io_req_drop_files(req);
}
-static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock, struct io_comp_state *cs)
+static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -5512,221 +5807,89 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- if (sqe) {
- ret = io_read_prep(req, sqe, force_nonblock);
- if (ret < 0)
- break;
- }
ret = io_read(req, force_nonblock, cs);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- if (sqe) {
- ret = io_write_prep(req, sqe, force_nonblock);
- if (ret < 0)
- break;
- }
ret = io_write(req, force_nonblock, cs);
break;
case IORING_OP_FSYNC:
- if (sqe) {
- ret = io_prep_fsync(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_fsync(req, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- if (sqe) {
- ret = io_poll_add_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_poll_add(req);
break;
case IORING_OP_POLL_REMOVE:
- if (sqe) {
- ret = io_poll_remove_prep(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_poll_remove(req);
break;
case IORING_OP_SYNC_FILE_RANGE:
- if (sqe) {
- ret = io_prep_sfr(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_sync_file_range(req, force_nonblock);
break;
case IORING_OP_SENDMSG:
+ ret = io_sendmsg(req, force_nonblock, cs);
+ break;
case IORING_OP_SEND:
- if (sqe) {
- ret = io_sendmsg_prep(req, sqe);
- if (ret < 0)
- break;
- }
- if (req->opcode == IORING_OP_SENDMSG)
- ret = io_sendmsg(req, force_nonblock, cs);
- else
- ret = io_send(req, force_nonblock, cs);
+ ret = io_send(req, force_nonblock, cs);
break;
case IORING_OP_RECVMSG:
+ ret = io_recvmsg(req, force_nonblock, cs);
+ break;
case IORING_OP_RECV:
- if (sqe) {
- ret = io_recvmsg_prep(req, sqe);
- if (ret)
- break;
- }
- if (req->opcode == IORING_OP_RECVMSG)
- ret = io_recvmsg(req, force_nonblock, cs);
- else
- ret = io_recv(req, force_nonblock, cs);
+ ret = io_recv(req, force_nonblock, cs);
break;
case IORING_OP_TIMEOUT:
- if (sqe) {
- ret = io_timeout_prep(req, sqe, false);
- if (ret)
- break;
- }
ret = io_timeout(req);
break;
case IORING_OP_TIMEOUT_REMOVE:
- if (sqe) {
- ret = io_timeout_remove_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_timeout_remove(req);
break;
case IORING_OP_ACCEPT:
- if (sqe) {
- ret = io_accept_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_accept(req, force_nonblock, cs);
break;
case IORING_OP_CONNECT:
- if (sqe) {
- ret = io_connect_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_connect(req, force_nonblock, cs);
break;
case IORING_OP_ASYNC_CANCEL:
- if (sqe) {
- ret = io_async_cancel_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_async_cancel(req);
break;
case IORING_OP_FALLOCATE:
- if (sqe) {
- ret = io_fallocate_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_fallocate(req, force_nonblock);
break;
case IORING_OP_OPENAT:
- if (sqe) {
- ret = io_openat_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_openat(req, force_nonblock);
break;
case IORING_OP_CLOSE:
- if (sqe) {
- ret = io_close_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_close(req, force_nonblock, cs);
break;
case IORING_OP_FILES_UPDATE:
- if (sqe) {
- ret = io_files_update_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_files_update(req, force_nonblock, cs);
break;
case IORING_OP_STATX:
- if (sqe) {
- ret = io_statx_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_statx(req, force_nonblock);
break;
case IORING_OP_FADVISE:
- if (sqe) {
- ret = io_fadvise_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_fadvise(req, force_nonblock);
break;
case IORING_OP_MADVISE:
- if (sqe) {
- ret = io_madvise_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_madvise(req, force_nonblock);
break;
case IORING_OP_OPENAT2:
- if (sqe) {
- ret = io_openat2_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_openat2(req, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
- if (sqe) {
- ret = io_epoll_ctl_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_epoll_ctl(req, force_nonblock, cs);
break;
case IORING_OP_SPLICE:
- if (sqe) {
- ret = io_splice_prep(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_splice(req, force_nonblock);
break;
case IORING_OP_PROVIDE_BUFFERS:
- if (sqe) {
- ret = io_provide_buffers_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_provide_buffers(req, force_nonblock, cs);
break;
case IORING_OP_REMOVE_BUFFERS:
- if (sqe) {
- ret = io_remove_buffers_prep(req, sqe);
- if (ret)
- break;
- }
ret = io_remove_buffers(req, force_nonblock, cs);
break;
case IORING_OP_TEE:
- if (sqe) {
- ret = io_tee_prep(req, sqe);
- if (ret < 0)
- break;
- }
ret = io_tee(req, force_nonblock);
break;
default:
@@ -5772,7 +5935,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
if (!ret) {
do {
- ret = io_issue_sqe(req, NULL, false, NULL);
+ ret = io_issue_sqe(req, false, NULL);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -5801,20 +5964,19 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
return table->files[index & IORING_FILE_TABLE_MASK];
}
-static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
- int fd, struct file **out_file, bool fixed)
+static struct file *io_file_get(struct io_submit_state *state,
+ struct io_kiocb *req, int fd, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
struct file *file;
if (fixed) {
- if (unlikely(!ctx->file_data ||
- (unsigned) fd >= ctx->nr_user_files))
- return -EBADF;
+ if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+ return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
if (file) {
- req->fixed_file_refs = ctx->file_data->cur_refs;
+ req->fixed_file_refs = &ctx->file_data->node->refs;
percpu_ref_get(req->fixed_file_refs);
}
} else {
@@ -5822,11 +5984,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
file = __io_file_get(state, fd);
}
- if (file || io_op_defs[req->opcode].needs_file_no_error) {
- *out_file = file;
- return 0;
- }
- return -EBADF;
+ return file;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
@@ -5838,46 +5996,10 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
if (unlikely(!fixed && io_async_submit(req->ctx)))
return -EBADF;
- return io_file_get(state, req, fd, &req->file, fixed);
-}
-
-static int io_grab_files(struct io_kiocb *req)
-{
- int ret = -EBADF;
- struct io_ring_ctx *ctx = req->ctx;
-
- io_req_init_async(req);
-
- if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
- return 0;
- if (!ctx->ring_file)
- return -EBADF;
-
- rcu_read_lock();
- spin_lock_irq(&ctx->inflight_lock);
- /*
- * We use the f_ops->flush() handler to ensure that we can flush
- * out work accessing these files if the fd is closed. Check if
- * the fd has changed since we started down this path, and disallow
- * this operation if it has.
- */
- if (fcheck(ctx->ring_fd) == ctx->ring_file) {
- list_add(&req->inflight_entry, &ctx->inflight_list);
- req->flags |= REQ_F_INFLIGHT;
- req->work.files = current->files;
- ret = 0;
- }
- spin_unlock_irq(&ctx->inflight_lock);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline int io_prep_work_files(struct io_kiocb *req)
-{
- if (!io_op_defs[req->opcode].file_table)
+ req->file = io_file_get(state, req, fd, fixed);
+ if (req->file || io_op_defs[req->opcode].needs_file_no_error)
return 0;
- return io_grab_files(req);
+ return -EBADF;
}
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
@@ -5917,22 +6039,27 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void io_queue_linked_timeout(struct io_kiocb *req)
+static void __io_queue_linked_timeout(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
/*
* If the list is now empty, then our linked request finished before
* we got a chance to setup the timer
*/
- spin_lock_irq(&ctx->completion_lock);
if (!list_empty(&req->link_list)) {
- struct io_timeout_data *data = &req->io->timeout;
+ struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
}
+}
+
+static void io_queue_linked_timeout(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ __io_queue_linked_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
/* drop submission reference */
@@ -5957,8 +6084,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return nxt;
}
-static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_comp_state *cs)
+static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt;
@@ -5978,7 +6104,7 @@ again:
old_creds = override_creds(req->work.creds);
}
- ret = io_issue_sqe(req, sqe, true, cs);
+ ret = io_issue_sqe(req, true, cs);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -5987,9 +6113,6 @@ again:
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
if (!io_arm_poll_handler(req)) {
punt:
- ret = io_prep_work_files(req);
- if (unlikely(ret))
- goto err;
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
@@ -6003,7 +6126,6 @@ punt:
}
if (unlikely(ret)) {
-err:
/* un-prep timeout, so it'll be killed as any other linked */
req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
@@ -6043,7 +6165,7 @@ fail_req:
io_req_complete(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- if (!req->io) {
+ if (!req->async_data) {
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret))
goto fail_req;
@@ -6057,7 +6179,12 @@ fail_req:
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
- __io_queue_sqe(req, sqe, cs);
+ if (sqe) {
+ ret = io_req_prep(req, sqe);
+ if (unlikely(ret))
+ goto fail_req;
+ }
+ __io_queue_sqe(req, cs);
}
}
@@ -6105,7 +6232,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
trace_io_uring_link(ctx, req, head);
- io_get_req_task(req);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
@@ -6154,9 +6280,6 @@ static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
blk_start_plug(&state->plug);
-#ifdef CONFIG_BLOCK
- state->plug.nowait = true;
-#endif
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
@@ -6213,6 +6336,32 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
ctx->cached_sq_head++;
}
+/*
+ * Check SQE restrictions (opcode and flags).
+ *
+ * Returns 'true' if SQE is allowed, 'false' otherwise.
+ */
+static inline bool io_check_restriction(struct io_ring_ctx *ctx,
+ struct io_kiocb *req,
+ unsigned int sqe_flags)
+{
+ if (!ctx->restricted)
+ return true;
+
+ if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+ return false;
+
+ if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+ ctx->restrictions.sqe_flags_required)
+ return false;
+
+ if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+ ctx->restrictions.sqe_flags_required))
+ return false;
+
+ return true;
+}
+
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
@@ -6222,11 +6371,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
struct io_submit_state *state)
{
unsigned int sqe_flags;
- int id;
+ int id, ret;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
- req->io = NULL;
+ req->async_data = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -6246,6 +6395,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
return -EINVAL;
+ if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+ return -EACCES;
+
if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;
@@ -6265,11 +6417,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (!io_op_defs[req->opcode].needs_file)
return 0;
- return io_req_set_file(state, req, READ_ONCE(sqe->fd));
+ ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
+ state->ios_left--;
+ return ret;
}
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state;
struct io_kiocb *link = NULL;
@@ -6278,7 +6431,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
if (!list_empty(&ctx->cq_overflow_list) &&
- !io_cqring_overflow_flush(ctx, false))
+ !io_cqring_overflow_flush(ctx, false, NULL, NULL))
return -EBUSY;
}
@@ -6288,10 +6441,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- io_submit_state_start(&state, ctx, nr);
+ atomic_long_add(nr, &current->io_uring->req_issue);
+ refcount_add(nr, &current->usage);
- ctx->ring_fd = ring_fd;
- ctx->ring_file = ring_file;
+ io_submit_state_start(&state, ctx, nr);
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
@@ -6309,12 +6462,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted = -EAGAIN;
break;
}
-
- err = io_init_req(ctx, req, sqe, &state);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
+ err = io_init_req(ctx, req, sqe, &state);
if (unlikely(err)) {
fail_req:
io_put_req(req);
@@ -6333,6 +6485,8 @@ fail_req:
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
percpu_ref_put_many(&ctx->refs, nr - ref_used);
+ atomic_long_sub(nr - ref_used, &current->io_uring->req_issue);
+ put_task_struct_many(current, nr - ref_used);
}
if (link)
io_queue_link_head(link, &state.comp);
@@ -6359,117 +6513,186 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static int io_sq_thread(void *data)
+static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
+ int sync, void *key)
{
- struct io_ring_ctx *ctx = data;
- const struct cred *old_cred;
- DEFINE_WAIT(wait);
- unsigned long timeout;
+ struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
+ int ret;
+
+ ret = autoremove_wake_function(wqe, mode, sync, key);
+ if (ret) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ }
+ return ret;
+}
+
+enum sq_ret {
+ SQT_IDLE = 1,
+ SQT_SPIN = 2,
+ SQT_DID_WORK = 4,
+};
+
+static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
+ unsigned long start_jiffies, bool cap_entries)
+{
+ unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
+ struct io_sq_data *sqd = ctx->sq_data;
+ unsigned int to_submit;
int ret = 0;
- complete(&ctx->sq_thread_comp);
+again:
+ if (!list_empty(&ctx->iopoll_list)) {
+ unsigned nr_events = 0;
- old_cred = override_creds(ctx->creds);
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ io_do_iopoll(ctx, &nr_events, 0);
+ mutex_unlock(&ctx->uring_lock);
+ }
+
+ to_submit = io_sqring_entries(ctx);
+
+ /*
+ * If submit got -EBUSY, flag us as needing the application
+ * to enter the kernel to reap and flush events.
+ */
+ if (!to_submit || ret == -EBUSY || need_resched()) {
+ /*
+ * Drop cur_mm before scheduling, we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ io_sq_thread_drop_mm();
- timeout = jiffies + ctx->sq_thread_idle;
- while (!kthread_should_park()) {
- unsigned int to_submit;
+ /*
+ * We're polling. If we're within the defined idle
+ * period, then let us spin without work before going
+ * to sleep. The exception is if we got EBUSY doing
+ * more IO, we should wait for the application to
+ * reap events and wake us up.
+ */
+ if (!list_empty(&ctx->iopoll_list) || need_resched() ||
+ (!time_after(jiffies, timeout) && ret != -EBUSY &&
+ !percpu_ref_is_dying(&ctx->refs)))
+ return SQT_SPIN;
- if (!list_empty(&ctx->iopoll_list)) {
- unsigned nr_events = 0;
+ prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
+ TASK_INTERRUPTIBLE);
- mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list) && !need_resched())
- io_do_iopoll(ctx, &nr_events, 0);
- else
- timeout = jiffies + ctx->sq_thread_idle;
- mutex_unlock(&ctx->uring_lock);
+ /*
+ * While doing polled IO, before going to sleep, we need
+ * to check if there are new reqs added to iopoll_list,
+ * it is because reqs may have been punted to io worker
+ * and will be added to iopoll_list later, hence check
+ * the iopoll_list again.
+ */
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ goto again;
}
to_submit = io_sqring_entries(ctx);
+ if (!to_submit || ret == -EBUSY)
+ return SQT_IDLE;
+ }
+
+ finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ io_ring_clear_wakeup_flag(ctx);
+
+ /* if we're handling multiple rings, cap submit size for fairness */
+ if (cap_entries && to_submit > 8)
+ to_submit = 8;
+
+ mutex_lock(&ctx->uring_lock);
+ if (likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
+
+ return SQT_DID_WORK;
+}
+
+static void io_sqd_init_new(struct io_sq_data *sqd)
+{
+ struct io_ring_ctx *ctx;
+
+ while (!list_empty(&sqd->ctx_new_list)) {
+ ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
+ init_wait(&ctx->sqo_wait_entry);
+ ctx->sqo_wait_entry.func = io_sq_wake_function;
+ list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
+ complete(&ctx->sq_thread_comp);
+ }
+}
+
+static int io_sq_thread(void *data)
+{
+ struct cgroup_subsys_state *cur_css = NULL;
+ const struct cred *old_cred = NULL;
+ struct io_sq_data *sqd = data;
+ struct io_ring_ctx *ctx;
+ unsigned long start_jiffies;
+
+ start_jiffies = jiffies;
+ while (!kthread_should_stop()) {
+ enum sq_ret ret = 0;
+ bool cap_entries;
/*
- * If submit got -EBUSY, flag us as needing the application
- * to enter the kernel to reap and flush events.
+ * Any changes to the sqd lists are synchronized through the
+ * kthread parking. This synchronizes the thread vs users,
+ * the users are synchronized on the sqd->ctx_lock.
*/
- if (!to_submit || ret == -EBUSY || need_resched()) {
- /*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
- */
- io_sq_thread_drop_mm();
+ if (kthread_should_park())
+ kthread_parkme();
- /*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep. The exception is if we got EBUSY doing
- * more IO, we should wait for the application to
- * reap events and wake us up.
- */
- if (!list_empty(&ctx->iopoll_list) || need_resched() ||
- (!time_after(jiffies, timeout) && ret != -EBUSY &&
- !percpu_ref_is_dying(&ctx->refs))) {
- io_run_task_work();
- cond_resched();
- continue;
- }
+ if (unlikely(!list_empty(&sqd->ctx_new_list)))
+ io_sqd_init_new(sqd);
- prepare_to_wait(&ctx->sqo_wait, &wait,
- TASK_INTERRUPTIBLE);
+ cap_entries = !list_is_singular(&sqd->ctx_list);
- /*
- * While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to iopoll_list,
- * it is because reqs may have been punted to io worker
- * and will be added to iopoll_list later, hence check
- * the iopoll_list again.
- */
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- finish_wait(&ctx->sqo_wait, &wait);
- continue;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (current->cred != ctx->creds) {
+ if (old_cred)
+ revert_creds(old_cred);
+ old_cred = override_creds(ctx->creds);
}
+ io_sq_thread_associate_blkcg(ctx, &cur_css);
- io_ring_set_wakeup_flag(ctx);
+ ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
- to_submit = io_sqring_entries(ctx);
- if (!to_submit || ret == -EBUSY) {
- if (kthread_should_park()) {
- finish_wait(&ctx->sqo_wait, &wait);
- break;
- }
- if (io_run_task_work()) {
- finish_wait(&ctx->sqo_wait, &wait);
- io_ring_clear_wakeup_flag(ctx);
- continue;
- }
- if (signal_pending(current))
- flush_signals(current);
- schedule();
- finish_wait(&ctx->sqo_wait, &wait);
+ io_sq_thread_drop_mm();
+ }
- io_ring_clear_wakeup_flag(ctx);
- ret = 0;
+ if (ret & SQT_SPIN) {
+ io_run_task_work();
+ cond_resched();
+ } else if (ret == SQT_IDLE) {
+ if (kthread_should_park())
continue;
- }
- finish_wait(&ctx->sqo_wait, &wait);
-
- io_ring_clear_wakeup_flag(ctx);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_ring_set_wakeup_flag(ctx);
+ schedule();
+ start_jiffies = jiffies;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_ring_clear_wakeup_flag(ctx);
}
-
- mutex_lock(&ctx->uring_lock);
- if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit, NULL, -1);
- mutex_unlock(&ctx->uring_lock);
- timeout = jiffies + ctx->sq_thread_idle;
}
io_run_task_work();
- io_sq_thread_drop_mm();
- revert_creds(old_cred);
+ if (cur_css)
+ io_sq_thread_unassociate_blkcg();
+ if (old_cred)
+ revert_creds(old_cred);
kthread_parkme();
@@ -6509,6 +6732,22 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
return autoremove_wake_function(curr, mode, wake_flags, key);
}
+static int io_run_task_work_sig(void)
+{
+ if (io_run_task_work())
+ return 1;
+ if (!signal_pending(current))
+ return 0;
+ if (current->jobctl & JOBCTL_TASK_WORK) {
+ spin_lock_irq(&current->sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 1;
+ }
+ return -EINTR;
+}
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -6554,19 +6793,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
/* make sure we run task_work before checking for signals */
- if (io_run_task_work())
+ ret = io_run_task_work_sig();
+ if (ret > 0)
continue;
- if (signal_pending(current)) {
- if (current->jobctl & JOBCTL_TASK_WORK) {
- spin_lock_irq(&current->sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- continue;
- }
- ret = -EINTR;
+ else if (ret < 0)
break;
- }
if (io_should_wake(&iowq, false))
break;
schedule();
@@ -6644,18 +6875,116 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return 0;
}
-static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+static void io_put_sq_data(struct io_sq_data *sqd)
{
- if (ctx->sqo_thread) {
- wait_for_completion(&ctx->sq_thread_comp);
+ if (refcount_dec_and_test(&sqd->refs)) {
/*
* The park is a bit of a work-around, without it we get
* warning spews on shutdown with SQPOLL set and affinity
* set to a single CPU.
*/
- kthread_park(ctx->sqo_thread);
- kthread_stop(ctx->sqo_thread);
- ctx->sqo_thread = NULL;
+ if (sqd->thread) {
+ kthread_park(sqd->thread);
+ kthread_stop(sqd->thread);
+ }
+
+ kfree(sqd);
+ }
+}
+
+static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx_attach;
+ struct io_sq_data *sqd;
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return ERR_PTR(-ENXIO);
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ ctx_attach = f.file->private_data;
+ sqd = ctx_attach->sq_data;
+ if (!sqd) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ refcount_inc(&sqd->refs);
+ fdput(f);
+ return sqd;
+}
+
+static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
+{
+ struct io_sq_data *sqd;
+
+ if (p->flags & IORING_SETUP_ATTACH_WQ)
+ return io_attach_sq_data(p);
+
+ sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
+ if (!sqd)
+ return ERR_PTR(-ENOMEM);
+
+ refcount_set(&sqd->refs, 1);
+ INIT_LIST_HEAD(&sqd->ctx_list);
+ INIT_LIST_HEAD(&sqd->ctx_new_list);
+ mutex_init(&sqd->ctx_lock);
+ mutex_init(&sqd->lock);
+ init_waitqueue_head(&sqd->wait);
+ return sqd;
+}
+
+static void io_sq_thread_unpark(struct io_sq_data *sqd)
+ __releases(&sqd->lock)
+{
+ if (!sqd->thread)
+ return;
+ kthread_unpark(sqd->thread);
+ mutex_unlock(&sqd->lock);
+}
+
+static void io_sq_thread_park(struct io_sq_data *sqd)
+ __acquires(&sqd->lock)
+{
+ if (!sqd->thread)
+ return;
+ mutex_lock(&sqd->lock);
+ kthread_park(sqd->thread);
+}
+
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (sqd) {
+ if (sqd->thread) {
+ /*
+ * We may arrive here from the error branch in
+ * io_sq_offload_create() where the kthread is created
+ * without being waked up, thus wake it up now to make
+ * sure the wait will complete.
+ */
+ wake_up_process(sqd->thread);
+ wait_for_completion(&ctx->sq_thread_comp);
+
+ io_sq_thread_park(sqd);
+ }
+
+ mutex_lock(&sqd->ctx_lock);
+ list_del(&ctx->sqd_list);
+ mutex_unlock(&sqd->ctx_lock);
+
+ if (sqd->thread) {
+ finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ io_sq_thread_unpark(sqd);
+ }
+
+ io_put_sq_data(sqd);
+ ctx->sq_data = NULL;
}
}
@@ -6766,13 +7095,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
- unsigned nr_files)
+static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
+ unsigned nr_tables, unsigned nr_files)
{
int i;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &ctx->file_data->table[i];
+ struct fixed_file_table *table = &file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -6787,7 +7116,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
return 0;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &ctx->file_data->table[i];
+ struct fixed_file_table *table = &file_data->table[i];
kfree(table->files);
}
return 1;
@@ -6949,11 +7278,11 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
__s32 __user *fds = (__s32 __user *) arg;
- unsigned nr_tables;
+ unsigned nr_tables, i;
struct file *file;
- int fd, ret = 0;
- unsigned i;
+ int fd, ret = -ENOMEM;
struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -6962,60 +7291,43 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
- ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
- if (!ctx->file_data)
+ file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+ if (!file_data)
return -ENOMEM;
- ctx->file_data->ctx = ctx;
- init_completion(&ctx->file_data->done);
- INIT_LIST_HEAD(&ctx->file_data->ref_list);
- spin_lock_init(&ctx->file_data->lock);
+ file_data->ctx = ctx;
+ init_completion(&file_data->done);
+ INIT_LIST_HEAD(&file_data->ref_list);
+ spin_lock_init(&file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
- ctx->file_data->table = kcalloc(nr_tables,
- sizeof(struct fixed_file_table),
- GFP_KERNEL);
- if (!ctx->file_data->table) {
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
+ file_data->table = kcalloc(nr_tables, sizeof(file_data->table),
+ GFP_KERNEL);
+ if (!file_data->table)
+ goto out_free;
- if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- kfree(ctx->file_data->table);
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
+ if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto out_free;
- if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
- percpu_ref_exit(&ctx->file_data->refs);
- kfree(ctx->file_data->table);
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
+ if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
+ goto out_ref;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
struct fixed_file_table *table;
unsigned index;
- ret = -EFAULT;
- if (copy_from_user(&fd, &fds[i], sizeof(fd)))
- break;
+ if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ ret = -EFAULT;
+ goto out_fput;
+ }
/* allow sparse sets */
- if (fd == -1) {
- ret = 0;
+ if (fd == -1)
continue;
- }
- table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- index = i & IORING_FILE_TABLE_MASK;
file = fget(fd);
-
ret = -EBADF;
if (!file)
- break;
+ goto out_fput;
/*
* Don't allow io_uring instances to be registered. If UNIX
@@ -7026,29 +7338,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (file->f_op == &io_uring_fops) {
fput(file);
- break;
+ goto out_fput;
}
- ret = 0;
+ table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+ index = i & IORING_FILE_TABLE_MASK;
table->files[index] = file;
}
- if (ret) {
- for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
- for (i = 0; i < nr_tables; i++)
- kfree(ctx->file_data->table[i].files);
-
- percpu_ref_exit(&ctx->file_data->refs);
- kfree(ctx->file_data->table);
- kfree(ctx->file_data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
- return ret;
- }
-
+ ctx->file_data = file_data;
ret = io_sqe_files_scm(ctx);
if (ret) {
io_sqe_files_unregister(ctx);
@@ -7061,11 +7358,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return PTR_ERR(ref_node);
}
- ctx->file_data->cur_refs = &ref_node->refs;
- spin_lock(&ctx->file_data->lock);
- list_add(&ref_node->node, &ctx->file_data->ref_list);
- spin_unlock(&ctx->file_data->lock);
- percpu_ref_get(&ctx->file_data->refs);
+ file_data->node = ref_node;
+ spin_lock(&file_data->lock);
+ list_add(&ref_node->node, &file_data->ref_list);
+ spin_unlock(&file_data->lock);
+ percpu_ref_get(&file_data->refs);
+ return ret;
+out_fput:
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
+ for (i = 0; i < nr_tables; i++)
+ kfree(file_data->table[i].files);
+ ctx->nr_user_files = 0;
+out_ref:
+ percpu_ref_exit(&file_data->refs);
+out_free:
+ kfree(file_data->table);
+ kfree(file_data);
return ret;
}
@@ -7116,14 +7428,12 @@ static int io_queue_file_removal(struct fixed_file_data *data,
struct file *file)
{
struct io_file_put *pfile;
- struct percpu_ref *refs = data->cur_refs;
- struct fixed_file_ref_node *ref_node;
+ struct fixed_file_ref_node *ref_node = data->node;
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile)
return -ENOMEM;
- ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
list_add(&pfile->list, &ref_node->file_list);
@@ -7166,7 +7476,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
- file = io_file_from_index(ctx, index);
+ file = table->files[index];
err = io_queue_file_removal(data, file);
if (err)
break;
@@ -7195,6 +7505,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
table->files[index] = file;
err = io_sqe_file_register(ctx, file, i);
if (err) {
+ table->files[index] = NULL;
fput(file);
break;
}
@@ -7205,10 +7516,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
if (needs_switch) {
- percpu_ref_kill(data->cur_refs);
+ percpu_ref_kill(&data->node->refs);
spin_lock(&data->lock);
list_add(&ref_node->node, &data->ref_list);
- data->cur_refs = &ref_node->refs;
+ data->node = ref_node;
spin_unlock(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
@@ -7289,23 +7600,65 @@ out_fput:
return ret;
}
-static int io_sq_offload_start(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static int io_uring_alloc_task_context(struct task_struct *task)
{
- int ret;
+ struct io_uring_task *tctx;
- mmgrab(current->mm);
- ctx->sqo_mm = current->mm;
+ tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+ if (unlikely(!tctx))
+ return -ENOMEM;
+
+ xa_init(&tctx->xa);
+ init_waitqueue_head(&tctx->wait);
+ tctx->last = NULL;
+ tctx->in_idle = 0;
+ atomic_long_set(&tctx->req_issue, 0);
+ atomic_long_set(&tctx->req_complete, 0);
+ task->io_uring = tctx;
+ return 0;
+}
+
+void __io_uring_free(struct task_struct *tsk)
+{
+ struct io_uring_task *tctx = tsk->io_uring;
+
+ WARN_ON_ONCE(!xa_empty(&tctx->xa));
+ kfree(tctx);
+ tsk->io_uring = NULL;
+}
+
+static int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ int ret;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_sq_data *sqd;
+
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto err;
+ sqd = io_get_sq_data(p);
+ if (IS_ERR(sqd)) {
+ ret = PTR_ERR(sqd);
+ goto err;
+ }
+
+ ctx->sq_data = sqd;
+ io_sq_thread_park(sqd);
+ mutex_lock(&sqd->ctx_lock);
+ list_add(&ctx->sqd_list, &sqd->ctx_new_list);
+ mutex_unlock(&sqd->ctx_lock);
+ io_sq_thread_unpark(sqd);
+
ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
if (!ctx->sq_thread_idle)
ctx->sq_thread_idle = HZ;
+ if (sqd->thread)
+ goto done;
+
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
@@ -7315,25 +7668,27 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
if (!cpu_online(cpu))
goto err;
- ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
- ctx, cpu,
- "io_uring-sq");
+ sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
+ cpu, "io_uring-sq");
} else {
- ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+ sqd->thread = kthread_create(io_sq_thread, sqd,
"io_uring-sq");
}
- if (IS_ERR(ctx->sqo_thread)) {
- ret = PTR_ERR(ctx->sqo_thread);
- ctx->sqo_thread = NULL;
+ if (IS_ERR(sqd->thread)) {
+ ret = PTR_ERR(sqd->thread);
+ sqd->thread = NULL;
goto err;
}
- wake_up_process(ctx->sqo_thread);
+ ret = io_uring_alloc_task_context(sqd->thread);
+ if (ret)
+ goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
/* Can't have SQ_AFF without SQPOLL */
ret = -EINVAL;
goto err;
}
+done:
ret = io_init_wq_offload(ctx, p);
if (ret)
goto err;
@@ -7341,13 +7696,17 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
return 0;
err:
io_finish_async(ctx);
- if (ctx->sqo_mm) {
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
- }
return ret;
}
+static void io_sq_offload_start(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
+ wake_up_process(sqd->thread);
+}
+
static inline void __io_unaccount_mem(struct user_struct *user,
unsigned long nr_pages)
{
@@ -7379,11 +7738,11 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
if (ctx->limit_mem)
__io_unaccount_mem(ctx->user, nr_pages);
- if (ctx->sqo_mm) {
+ if (ctx->mm_account) {
if (acct == ACCT_LOCKED)
- ctx->sqo_mm->locked_vm -= nr_pages;
+ ctx->mm_account->locked_vm -= nr_pages;
else if (acct == ACCT_PINNED)
- atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+ atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
}
@@ -7398,11 +7757,11 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
return ret;
}
- if (ctx->sqo_mm) {
+ if (ctx->mm_account) {
if (acct == ACCT_LOCKED)
- ctx->sqo_mm->locked_vm += nr_pages;
+ ctx->mm_account->locked_vm += nr_pages;
else if (acct == ACCT_PINNED)
- atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+ atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
}
return 0;
@@ -7482,7 +7841,8 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
for (j = 0; j < imu->nr_bvecs; j++)
unpin_user_page(imu->bvec[j].bv_page);
- io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -7518,11 +7878,80 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
return 0;
}
+/*
+ * Not super efficient, but this is just a registration time. And we do cache
+ * the last compound head, so generally we'll only do a full search if we don't
+ * match that one.
+ *
+ * We check if the given compound head page has already been accounted, to
+ * avoid double accounting it. This allows us to account the full size of the
+ * page, not just the constituent pages of a huge page.
+ */
+static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
+ int nr_pages, struct page *hpage)
+{
+ int i, j;
+
+ /* check current page array */
+ for (i = 0; i < nr_pages; i++) {
+ if (!PageCompound(pages[i]))
+ continue;
+ if (compound_head(pages[i]) == hpage)
+ return true;
+ }
+
+ /* check previously registered pages */
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+ for (j = 0; j < imu->nr_bvecs; j++) {
+ if (!PageCompound(imu->bvec[j].bv_page))
+ continue;
+ if (compound_head(imu->bvec[j].bv_page) == hpage)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
+ int nr_pages, struct io_mapped_ubuf *imu,
+ struct page **last_hpage)
+{
+ int i, ret;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (!PageCompound(pages[i])) {
+ imu->acct_pages++;
+ } else {
+ struct page *hpage;
+
+ hpage = compound_head(pages[i]);
+ if (hpage == *last_hpage)
+ continue;
+ *last_hpage = hpage;
+ if (headpage_already_acct(ctx, pages, i, hpage))
+ continue;
+ imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
+ }
+ }
+
+ if (!imu->acct_pages)
+ return 0;
+
+ ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ if (ret)
+ imu->acct_pages = 0;
+ return ret;
+}
+
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
+ struct page *last_hpage = NULL;
int i, j, got_pages = 0;
int ret = -EINVAL;
@@ -7565,10 +7994,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
- if (ret)
- goto err;
-
ret = 0;
if (!pages || nr_pages > got_pages) {
kvfree(vmas);
@@ -7580,7 +8005,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
GFP_KERNEL);
if (!pages || !vmas) {
ret = -ENOMEM;
- io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
goto err;
}
got_pages = nr_pages;
@@ -7589,10 +8013,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
GFP_KERNEL);
ret = -ENOMEM;
- if (!imu->bvec) {
- io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
+ if (!imu->bvec)
goto err;
- }
ret = 0;
mmap_read_lock(current->mm);
@@ -7621,7 +8043,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (pret > 0)
unpin_user_pages(pages, pret);
- io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
+ kvfree(imu->bvec);
+ goto err;
+ }
+
+ ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
+ if (ret) {
+ unpin_user_pages(pages, pret);
kvfree(imu->bvec);
goto err;
}
@@ -7706,11 +8134,19 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
io_sqe_buffer_unregister(ctx);
- if (ctx->sqo_mm) {
- mmdrop(ctx->sqo_mm);
- ctx->sqo_mm = NULL;
+
+ if (ctx->sqo_task) {
+ put_task_struct(ctx->sqo_task);
+ ctx->sqo_task = NULL;
+ mmdrop(ctx->mm_account);
+ ctx->mm_account = NULL;
}
+#ifdef CONFIG_BLK_CGROUP
+ if (ctx->sqo_blkcg_css)
+ css_put(ctx->sqo_blkcg_css);
+#endif
+
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
@@ -7745,8 +8181,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* io_commit_cqring
*/
smp_rmb();
- if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
- ctx->rings->sq_ring_entries)
+ if (!io_sqring_full(ctx))
mask |= EPOLLOUT | EPOLLWRNORM;
if (io_cqring_events(ctx, false))
mask |= EPOLLIN | EPOLLRDNORM;
@@ -7785,7 +8220,7 @@ static void io_ring_exit_work(struct work_struct *work)
*/
do {
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
@@ -7797,15 +8232,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx);
- io_poll_remove_all(ctx);
+ io_kill_timeouts(ctx, NULL);
+ io_poll_remove_all(ctx, NULL);
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
- io_cqring_overflow_flush(ctx, true);
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
@@ -7818,7 +8253,13 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
ACCT_LOCKED);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
- queue_work(system_wq, &ctx->exit_work);
+ /*
+ * Use system_unbound_wq to avoid spawning tons of event kworkers
+ * if we're exiting a ton of rings at the same time. It just adds
+ * noise and overhead, there's no discernable change in runtime
+ * over using system_wq.
+ */
+ queue_work(system_unbound_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
@@ -7834,15 +8275,152 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data)
{
struct files_struct *files = data;
- return work->files == files;
+ return !files || work->files == files;
+}
+
+/*
+ * Returns true if 'preq' is the link parent of 'req'
+ */
+static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ if (!(preq->flags & REQ_F_LINK_HEAD))
+ return false;
+
+ list_for_each_entry(link, &preq->link_list, link_list) {
+ if (link == req)
+ return true;
+ }
+
+ return false;
+}
+
+static bool io_match_link_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ struct io_kiocb *link;
+
+ if (io_match_files(req, files))
+ return true;
+ if (req->flags & REQ_F_LINK_HEAD) {
+ list_for_each_entry(link, &req->link_list, link_list) {
+ if (io_match_files(link, files))
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * We're looking to cancel 'req' because it's holding on to our files, but
+ * 'req' could be a link to another request. See if it is, and cancel that
+ * parent request if so.
+ */
+static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *preq;
+ bool found = false;
+ int i;
+
+ spin_lock_irq(&ctx->completion_lock);
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[i];
+ hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
+ found = io_match_link(preq, req);
+ if (found) {
+ io_poll_remove_one(preq);
+ break;
+ }
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ return found;
+}
+
+static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_kiocb *preq;
+ bool found = false;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
+ found = io_match_link(preq, req);
+ if (found) {
+ __io_timeout_cancel(preq);
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ return found;
+}
+
+static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+{
+ return io_match_link(container_of(work, struct io_kiocb, work), data);
+}
+
+static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ enum io_wq_cancel cret;
+
+ /* cancel this particular work, if it's running */
+ cret = io_wq_cancel_work(ctx->io_wq, &req->work);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ return;
+
+ /* find links that hold this pending, cancel those */
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ return;
+
+ /* if we have a poll link holding this pending, cancel that */
+ if (io_poll_remove_link(ctx, req))
+ return;
+
+ /* final option, timeout link is holding this req pending */
+ io_timeout_remove_link(ctx, req);
+}
+
+static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct io_defer_entry *de = NULL;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_reverse(de, &ctx->defer_list, list) {
+ if (io_match_link_files(de->req, files)) {
+ list_cut_position(&list, &ctx->defer_list, &de->list);
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ while (!list_empty(&list)) {
+ de = list_first_entry(&list, struct io_defer_entry, list);
+ list_del_init(&de->list);
+ req_set_fail_links(de->req);
+ io_put_req(de->req);
+ io_req_complete(de->req, -ECANCELED);
+ kfree(de);
+ }
}
-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+/*
+ * Returns true if we found and killed one or more files pinning requests
+ */
+static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
if (list_empty_careful(&ctx->inflight_list))
- return;
+ return false;
+ io_cancel_defer_files(ctx, files);
/* cancel all at once, should be faster than doing it one by one*/
io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
@@ -7852,7 +8430,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (req->work.files != files)
+ if (files && req->work.files != files)
continue;
/* req is being completed, ignore */
if (!refcount_inc_not_zero(&req->refs))
@@ -7868,57 +8446,215 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
/* We need to keep going until we don't find a matching req */
if (!cancel_req)
break;
+ /* cancel this request, or head link requests */
+ io_attempt_cancel(ctx, cancel_req);
+ io_put_req(cancel_req);
+ /* cancellations _may_ trigger task work */
+ io_run_task_work();
+ schedule();
+ finish_wait(&ctx->inflight_wait, &wait);
+ }
- if (cancel_req->flags & REQ_F_OVERFLOW) {
- spin_lock_irq(&ctx->completion_lock);
- list_del(&cancel_req->compl.list);
- cancel_req->flags &= ~REQ_F_OVERFLOW;
+ return true;
+}
- io_cqring_mark_overflow(ctx);
- WRITE_ONCE(ctx->rings->cq_overflow,
- atomic_inc_return(&ctx->cached_cq_overflow));
- io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
- /*
- * Put inflight ref and overflow ref. If that's
- * all we had, then we're done with this request.
- */
- if (refcount_sub_and_test(2, &cancel_req->refs)) {
- io_free_req(cancel_req);
- finish_wait(&ctx->inflight_wait, &wait);
- continue;
+ return io_task_match(req, task);
+}
+
+static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files)
+{
+ bool ret;
+
+ ret = io_uring_cancel_files(ctx, files);
+ if (!files) {
+ enum io_wq_cancel cret;
+
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ ret = true;
+
+ /* SQPOLL thread does its own polling */
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ while (!list_empty_careful(&ctx->iopoll_list)) {
+ io_iopoll_try_reap_events(ctx);
+ ret = true;
}
- } else {
- io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
- io_put_req(cancel_req);
}
- schedule();
- finish_wait(&ctx->inflight_wait, &wait);
+ ret |= io_poll_remove_all(ctx, task);
+ ret |= io_kill_timeouts(ctx, task);
+ }
+
+ return ret;
+}
+
+/*
+ * We need to iteratively cancel requests, in case a request has dependent
+ * hard links. These persist even for failure of cancelations, hence keep
+ * looping until none are found.
+ */
+static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct task_struct *task = current;
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data)
+ task = ctx->sq_data->thread;
+
+ io_cqring_overflow_flush(ctx, true, task, files);
+
+ while (__io_uring_cancel_task_requests(ctx, task, files)) {
+ io_run_task_work();
+ cond_resched();
}
}
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+/*
+ * Note that this task has used io_uring. We use it for cancelation purposes.
+ */
+static int io_uring_add_task_file(struct file *file)
{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct task_struct *task = data;
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (unlikely(!tctx)) {
+ int ret;
+
+ ret = io_uring_alloc_task_context(current);
+ if (unlikely(ret))
+ return ret;
+ tctx = current->io_uring;
+ }
+ if (tctx->last != file) {
+ void *old = xa_load(&tctx->xa, (unsigned long)file);
+
+ if (!old) {
+ get_file(file);
+ xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
+ }
+ tctx->last = file;
+ }
+
+ return 0;
+}
+
+/*
+ * Remove this io_uring_file -> task mapping.
+ */
+static void io_uring_del_task_file(struct file *file)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (tctx->last == file)
+ tctx->last = NULL;
+ file = xa_erase(&tctx->xa, (unsigned long)file);
+ if (file)
+ fput(file);
+}
+
+static void __io_uring_attempt_task_drop(struct file *file)
+{
+ struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file);
+
+ if (old == file)
+ io_uring_del_task_file(file);
+}
+
+/*
+ * Drop task note for this file if we're the only ones that hold it after
+ * pending fput()
+ */
+static void io_uring_attempt_task_drop(struct file *file, bool exiting)
+{
+ if (!current->io_uring)
+ return;
+ /*
+ * fput() is pending, will be 2 if the only other ref is our potential
+ * task file note. If the task is exiting, drop regardless of count.
+ */
+ if (!exiting && atomic_long_read(&file->f_count) != 2)
+ return;
+
+ __io_uring_attempt_task_drop(file);
+}
+
+void __io_uring_files_cancel(struct files_struct *files)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ struct file *file;
+ unsigned long index;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+ xa_for_each(&tctx->xa, index, file) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ io_uring_cancel_task_requests(ctx, files);
+ if (files)
+ io_uring_del_task_file(file);
+ }
+}
+
+static inline bool io_uring_task_idle(struct io_uring_task *tctx)
+{
+ return atomic_long_read(&tctx->req_issue) ==
+ atomic_long_read(&tctx->req_complete);
+}
+
+/*
+ * Find any io_uring fd that this task has registered or done IO on, and cancel
+ * requests.
+ */
+void __io_uring_task_cancel(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ DEFINE_WAIT(wait);
+ long completions;
+
+ /* make sure overflow events are dropped */
+ tctx->in_idle = true;
+
+ while (!io_uring_task_idle(tctx)) {
+ /* read completions before cancelations */
+ completions = atomic_long_read(&tctx->req_complete);
+ __io_uring_files_cancel(NULL);
+
+ prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ /*
+ * If we've seen completions, retry. This avoids a race where
+ * a completion comes in before we did prepare_to_wait().
+ */
+ if (completions != atomic_long_read(&tctx->req_complete))
+ continue;
+ if (io_uring_task_idle(tctx))
+ break;
+ schedule();
+ }
- return req->task == task;
+ finish_wait(&tctx->wait, &wait);
+ tctx->in_idle = false;
}
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
- io_uring_cancel_files(ctx, data);
-
/*
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
+ data = NULL;
+ io_uring_cancel_task_requests(ctx, data);
+ io_uring_attempt_task_drop(file, !data);
return 0;
}
@@ -7992,6 +8728,25 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
#endif /* !CONFIG_MMU */
+static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
+{
+ DEFINE_WAIT(wait);
+
+ do {
+ if (!io_sqring_full(ctx))
+ break;
+
+ prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
+
+ if (!io_sqring_full(ctx))
+ break;
+
+ schedule();
+ } while (!signal_pending(current));
+
+ finish_wait(&ctx->sqo_sq_wait, &wait);
+}
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const sigset_t __user *, sig,
size_t, sigsz)
@@ -8003,7 +8758,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
- if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+ if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
+ IORING_ENTER_SQ_WAIT))
return -EINVAL;
f = fdget(fd);
@@ -8019,6 +8775,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (!percpu_ref_tryget(&ctx->refs))
goto out_fput;
+ ret = -EBADFD;
+ if (ctx->flags & IORING_SETUP_R_DISABLED)
+ goto out;
+
/*
* For SQ polling, the thread will do all submissions and completions.
* Just return the requested submit count, and wake the thread if
@@ -8027,13 +8787,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
if (!list_empty_careful(&ctx->cq_overflow_list))
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false, NULL, NULL);
if (flags & IORING_ENTER_SQ_WAKEUP)
- wake_up(&ctx->sqo_wait);
+ wake_up(&ctx->sq_data->wait);
+ if (flags & IORING_ENTER_SQ_WAIT)
+ io_sqpoll_wait_sq(ctx);
submitted = to_submit;
} else if (to_submit) {
+ ret = io_uring_add_task_file(f.file);
+ if (unlikely(ret))
+ goto out;
mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
+ submitted = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)
@@ -8099,11 +8864,25 @@ static int io_uring_show_cred(int id, void *p, void *data)
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
+ struct io_sq_data *sq = NULL;
+ bool has_lock;
int i;
- mutex_lock(&ctx->uring_lock);
+ /*
+ * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
+ * since fdinfo case grabs it in the opposite direction of normal use
+ * cases. If we fail to get the lock, we just don't iterate any
+ * structures that could be going away outside the io_uring mutex.
+ */
+ has_lock = mutex_trylock(&ctx->uring_lock);
+
+ if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+ sq = ctx->sq_data;
+
+ seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
+ seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
- for (i = 0; i < ctx->nr_user_files; i++) {
+ for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct fixed_file_table *table;
struct file *f;
@@ -8115,13 +8894,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "%5u: <none>\n", i);
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
- for (i = 0; i < ctx->nr_user_bufs; i++) {
+ for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
(unsigned int) buf->len);
}
- if (!idr_is_empty(&ctx->personality_idr)) {
+ if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
seq_printf(m, "Personalities:\n");
idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
}
@@ -8136,7 +8915,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
req->task->task_works != NULL);
}
spin_unlock_irq(&ctx->completion_lock);
- mutex_unlock(&ctx->uring_lock);
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
}
static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
@@ -8171,6 +8951,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_rings *rings;
size_t size, sq_array_offset;
+ /* make sure these are sane, as we already accounted them */
+ ctx->sq_entries = p->sq_entries;
+ ctx->cq_entries = p->cq_entries;
+
size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -8187,8 +8971,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->cq_ring_entries = p->cq_entries;
ctx->sq_mask = rings->sq_ring_mask;
ctx->cq_mask = rings->cq_ring_mask;
- ctx->sq_entries = rings->sq_ring_entries;
- ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
@@ -8232,6 +9014,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC);
if (IS_ERR(file)) {
+err_fd:
put_unused_fd(ret);
ret = PTR_ERR(file);
goto err;
@@ -8240,6 +9023,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
#endif
+ if (unlikely(io_uring_add_task_file(file))) {
+ file = ERR_PTR(-ENOMEM);
+ goto err_fd;
+ }
fd_install(ret, file);
return ret;
err:
@@ -8317,14 +9104,57 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->user = user;
ctx->creds = get_current_cred();
+ ctx->sqo_task = get_task_struct(current);
+
+ /*
+ * This is just grabbed for accounting purposes. When a process exits,
+ * the mm is exited and dropped before the files, hence we need to hang
+ * on to this mm purely for the purposes of being able to unaccount
+ * memory (locked/pinned vm). It's not used for anything else.
+ */
+ mmgrab(current->mm);
+ ctx->mm_account = current->mm;
+
+#ifdef CONFIG_BLK_CGROUP
+ /*
+ * The sq thread will belong to the original cgroup it was inited in.
+ * If the cgroup goes offline (e.g. disabling the io controller), then
+ * issued bios will be associated with the closest cgroup later in the
+ * block layer.
+ */
+ rcu_read_lock();
+ ctx->sqo_blkcg_css = blkcg_css();
+ ret = css_tryget_online(ctx->sqo_blkcg_css);
+ rcu_read_unlock();
+ if (!ret) {
+ /* don't init against a dying cgroup, have the user try again */
+ ctx->sqo_blkcg_css = NULL;
+ ret = -ENODEV;
+ goto err;
+ }
+#endif
+
+ /*
+ * Account memory _before_ installing the file descriptor. Once
+ * the descriptor is installed, it can get closed at any time. Also
+ * do this before hitting the general error path, as ring freeing
+ * will un-account as well.
+ */
+ io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+ ACCT_LOCKED);
+ ctx->limit_mem = limit_mem;
+
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
- ret = io_sq_offload_start(ctx, p);
+ ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
+ if (!(p->flags & IORING_SETUP_R_DISABLED))
+ io_sq_offload_start(ctx);
+
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -8354,14 +9184,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
}
/*
- * Account memory _before_ installing the file descriptor. Once
- * the descriptor is installed, it can get closed at any time.
- */
- io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
- ACCT_LOCKED);
- ctx->limit_mem = limit_mem;
-
- /*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
*/
@@ -8395,7 +9217,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
+ IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
+ IORING_SETUP_R_DISABLED))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -8471,6 +9294,91 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
return -EINVAL;
}
+static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ struct io_uring_restriction *res;
+ size_t size;
+ int i, ret;
+
+ /* Restrictions allowed only if rings started disabled */
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ /* We allow only a single restrictions registration */
+ if (ctx->restrictions.registered)
+ return -EBUSY;
+
+ if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
+ return -EINVAL;
+
+ size = array_size(nr_args, sizeof(*res));
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ res = memdup_user(arg, size);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ ret = 0;
+
+ for (i = 0; i < nr_args; i++) {
+ switch (res[i].opcode) {
+ case IORING_RESTRICTION_REGISTER_OP:
+ if (res[i].register_op >= IORING_REGISTER_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].register_op,
+ ctx->restrictions.register_op);
+ break;
+ case IORING_RESTRICTION_SQE_OP:
+ if (res[i].sqe_op >= IORING_OP_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
+ ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+ break;
+ case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
+ ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ /* Reset all restrictions if an error happened */
+ if (ret != 0)
+ memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+ else
+ ctx->restrictions.registered = true;
+
+ kfree(res);
+ return ret;
+}
+
+static int io_register_enable_rings(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ if (ctx->restrictions.registered)
+ ctx->restricted = 1;
+
+ ctx->flags &= ~IORING_SETUP_R_DISABLED;
+
+ io_sq_offload_start(ctx);
+
+ return 0;
+}
+
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
@@ -8512,11 +9420,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
+ do {
+ ret = wait_for_completion_interruptible(&ctx->ref_comp);
+ if (!ret)
+ break;
+ ret = io_run_task_work_sig();
+ if (ret < 0)
+ break;
+ } while (1);
+
mutex_lock(&ctx->uring_lock);
+
if (ret) {
percpu_ref_resurrect(&ctx->refs);
- ret = -EINTR;
+ goto out_quiesce;
+ }
+ }
+
+ if (ctx->restricted) {
+ if (opcode >= IORING_REGISTER_LAST) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!test_bit(opcode, ctx->restrictions.register_op)) {
+ ret = -EACCES;
goto out;
}
}
@@ -8580,15 +9508,25 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_unregister_personality(ctx, nr_args);
break;
+ case IORING_REGISTER_ENABLE_RINGS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_enable_rings(ctx);
+ break;
+ case IORING_REGISTER_RESTRICTIONS:
+ ret = io_register_restrictions(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
+out:
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
-out:
+out_quiesce:
reinit_completion(&ctx->ref_comp);
}
return ret;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index bcfc288dba3f..8180061b9e16 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -22,18 +22,25 @@
#include "../internal.h"
/*
- * Structure allocated for each page when block size < PAGE_SIZE to track
- * sub-page uptodate status and I/O completions.
+ * Structure allocated for each page or THP when block size < page size
+ * to track sub-page uptodate status and I/O completions.
*/
struct iomap_page {
- atomic_t read_count;
- atomic_t write_count;
+ atomic_t read_bytes_pending;
+ atomic_t write_bytes_pending;
spinlock_t uptodate_lock;
- DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+ unsigned long uptodate[];
};
static inline struct iomap_page *to_iomap_page(struct page *page)
{
+ /*
+ * per-block data is stored in the head page. Callers should
+ * not be dealing with tail pages (and if they are, they can
+ * call thp_head() first.
+ */
+ VM_BUG_ON_PGFLAGS(PageTail(page), page);
+
if (page_has_private(page))
return (struct iomap_page *)page_private(page);
return NULL;
@@ -45,20 +52,16 @@ static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
struct iomap_page *iop = to_iomap_page(page);
+ unsigned int nr_blocks = i_blocks_per_page(inode, page);
- if (iop || i_blocksize(inode) == PAGE_SIZE)
+ if (iop || nr_blocks <= 1)
return iop;
- iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
- atomic_set(&iop->read_count, 0);
- atomic_set(&iop->write_count, 0);
+ iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
+ GFP_NOFS | __GFP_NOFAIL);
spin_lock_init(&iop->uptodate_lock);
- bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
-
- /*
- * migrate_page_move_mapping() assumes that pages with private data have
- * their count elevated by 1.
- */
+ if (PageUptodate(page))
+ bitmap_fill(iop->uptodate, nr_blocks);
attach_page_private(page, iop);
return iop;
}
@@ -67,11 +70,14 @@ static void
iomap_page_release(struct page *page)
{
struct iomap_page *iop = detach_page_private(page);
+ unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
if (!iop)
return;
- WARN_ON_ONCE(atomic_read(&iop->read_count));
- WARN_ON_ONCE(atomic_read(&iop->write_count));
+ WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
+ WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
+ WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
+ PageUptodate(page));
kfree(iop);
}
@@ -142,19 +148,11 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
struct inode *inode = page->mapping->host;
unsigned first = off >> inode->i_blkbits;
unsigned last = (off + len - 1) >> inode->i_blkbits;
- bool uptodate = true;
unsigned long flags;
- unsigned int i;
spin_lock_irqsave(&iop->uptodate_lock, flags);
- for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
- if (i >= first && i <= last)
- set_bit(i, iop->uptodate);
- else if (!test_bit(i, iop->uptodate))
- uptodate = false;
- }
-
- if (uptodate)
+ bitmap_set(iop->uptodate, first, last - first + 1);
+ if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
SetPageUptodate(page);
spin_unlock_irqrestore(&iop->uptodate_lock, flags);
}
@@ -172,13 +170,6 @@ iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
}
static void
-iomap_read_finish(struct iomap_page *iop, struct page *page)
-{
- if (!iop || atomic_dec_and_test(&iop->read_count))
- unlock_page(page);
-}
-
-static void
iomap_read_page_end_io(struct bio_vec *bvec, int error)
{
struct page *page = bvec->bv_page;
@@ -191,7 +182,8 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error)
iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
}
- iomap_read_finish(iop, page);
+ if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending))
+ unlock_page(page);
}
static void
@@ -271,30 +263,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
ctx->cur_page_in_bio = true;
+ if (iop)
+ atomic_add(plen, &iop->read_bytes_pending);
- /*
- * Try to merge into a previous segment if we can.
- */
+ /* Try to merge into a previous segment if we can */
sector = iomap_sector(iomap, pos);
- if (ctx->bio && bio_end_sector(ctx->bio) == sector)
+ if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff,
+ &same_page))
+ goto done;
is_contig = true;
-
- if (is_contig &&
- __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) {
- if (!same_page && iop)
- atomic_inc(&iop->read_count);
- goto done;
}
- /*
- * If we start a new segment we need to increase the read count, and we
- * need to do so before submitting any previous full bio to make sure
- * that we don't prematurely unlock the page.
- */
- if (iop)
- atomic_inc(&iop->read_count);
-
- if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
+ if (!is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -571,13 +552,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
{
struct iomap_page *iop = iomap_page_create(inode, page);
loff_t block_size = i_blocksize(inode);
- loff_t block_start = pos & ~(block_size - 1);
- loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+ loff_t block_start = round_down(pos, block_size);
+ loff_t block_end = round_up(pos + len, block_size);
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
- int status;
if (PageUptodate(page))
return 0;
+ ClearPageError(page);
do {
iomap_adjust_read_range(inode, iop, &block_start,
@@ -594,14 +575,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
return -EIO;
zero_user_segments(page, poff, from, to, poff + plen);
- iomap_set_range_uptodate(page, poff, plen);
- continue;
+ } else {
+ int status = iomap_read_page_sync(block_start, page,
+ poff, plen, srcmap);
+ if (status)
+ return status;
}
-
- status = iomap_read_page_sync(block_start, page, poff, plen,
- srcmap);
- if (status)
- return status;
+ iomap_set_range_uptodate(page, poff, plen);
} while ((block_start += plen) < block_end);
return 0;
@@ -685,9 +665,8 @@ iomap_set_page_dirty(struct page *page)
}
EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
-static int
-__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+ size_t copied, struct page *page)
{
flush_dcache_page(page);
@@ -709,15 +688,15 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
return copied;
}
-static int
-iomap_write_end_inline(struct inode *inode, struct page *page,
- struct iomap *iomap, loff_t pos, unsigned copied)
+static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
+ struct iomap *iomap, loff_t pos, size_t copied)
{
void *addr;
WARN_ON_ONCE(!PageUptodate(page));
BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ flush_dcache_page(page);
addr = kmap_atomic(page);
memcpy(iomap->inline_data + pos, addr + pos, copied);
kunmap_atomic(addr);
@@ -726,13 +705,14 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
return copied;
}
-static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
- struct page *page, struct iomap *iomap, struct iomap *srcmap)
+/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
+static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+ size_t copied, struct page *page, struct iomap *iomap,
+ struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
loff_t old_size = inode->i_size;
- int ret;
+ size_t ret;
if (srcmap->type == IOMAP_INLINE) {
ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
@@ -811,13 +791,8 @@ again:
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- flush_dcache_page(page);
-
- status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+ copied = iomap_write_end(inode, pos, bytes, copied, page, iomap,
srcmap);
- if (unlikely(status < 0))
- break;
- copied = status;
cond_resched();
@@ -891,11 +866,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
srcmap);
- if (unlikely(status <= 0)) {
- if (WARN_ON_ONCE(status == 0))
- return -EIO;
- return status;
- }
+ if (WARN_ON_ONCE(status == 0))
+ return -EIO;
cond_resched();
@@ -928,11 +900,13 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
- unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
+static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
+ struct iomap *iomap, struct iomap *srcmap)
{
struct page *page;
int status;
+ unsigned offset = offset_in_page(pos);
+ unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
if (status)
@@ -944,38 +918,33 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
-static loff_t
-iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
+ loff_t length, void *data, struct iomap *iomap,
+ struct iomap *srcmap)
{
bool *did_zero = data;
loff_t written = 0;
- int status;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return count;
+ return length;
do {
- unsigned offset, bytes;
-
- offset = offset_in_page(pos);
- bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+ s64 bytes;
if (IS_DAX(inode))
- status = dax_iomap_zero(pos, offset, bytes, iomap);
+ bytes = dax_iomap_zero(pos, length, iomap);
else
- status = iomap_zero(inode, pos, offset, bytes, iomap,
- srcmap);
- if (status < 0)
- return status;
+ bytes = iomap_zero(inode, pos, length, iomap, srcmap);
+ if (bytes < 0)
+ return bytes;
pos += bytes;
- count -= bytes;
+ length -= bytes;
written += bytes;
if (did_zero)
*did_zero = true;
- } while (count > 0);
+ } while (length > 0);
return written;
}
@@ -1070,7 +1039,7 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void
iomap_finish_page_writeback(struct inode *inode, struct page *page,
- int error)
+ int error, unsigned int len)
{
struct iomap_page *iop = to_iomap_page(page);
@@ -1079,10 +1048,10 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page,
mapping_set_error(inode->i_mapping, -EIO);
}
- WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
+ WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
- if (!iop || atomic_dec_and_test(&iop->write_count))
+ if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
end_page_writeback(page);
}
@@ -1116,7 +1085,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
/* walk each page on bio, ending page IO on them */
bio_for_each_segment_all(bv, bio, iter_all)
- iomap_finish_page_writeback(inode, bv->bv_page, error);
+ iomap_finish_page_writeback(inode, bv->bv_page, error,
+ bv->bv_len);
bio_put(bio);
}
/* The ioend has been freed by bio_put() */
@@ -1332,8 +1302,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
&same_page);
- if (iop && !same_page)
- atomic_inc(&iop->write_count);
+ if (iop)
+ atomic_add(len, &iop->write_bytes_pending);
if (!merged) {
if (bio_full(wpc->ioend->io_bio, len)) {
@@ -1375,8 +1345,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
- WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
+ WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
/*
* Walk through the page to find areas to write back. If we run off the
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c1aafb2ab990..933f234d5bec 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -76,7 +76,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
dio->submit.cookie = submit_bio(bio);
}
-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
const struct iomap_dio_ops *dops = dio->dops;
struct kiocb *iocb = dio->iocb;
@@ -108,7 +108,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error &&
+ if (!dio->error && dio->size &&
(dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
int err;
err = invalidate_inode_pages2_range(inode->i_mapping,
@@ -118,6 +118,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
dio_warn_stale_pagecache(iocb->ki_filp);
}
+ inode_dio_end(file_inode(iocb->ki_filp));
/*
* If this is a DSYNC write, make sure we push it to stable storage now
* that we've written data.
@@ -125,11 +126,11 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
ret = generic_write_sync(iocb, ret);
- inode_dio_end(file_inode(iocb->ki_filp));
kfree(dio);
return ret;
}
+EXPORT_SYMBOL_GPL(iomap_dio_complete);
static void iomap_dio_complete_work(struct work_struct *work)
{
@@ -388,6 +389,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
case IOMAP_INLINE:
return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+ case IOMAP_DELALLOC:
+ /*
+ * DIO is not serialised against mmap() access at all, and so
+ * if the page_mkwrite occurs between the writeback and the
+ * iomap_apply() call in the DIO path, then it will see the
+ * DELALLOC block that the page-mkwrite allocated.
+ */
+ pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
+ dio->iocb->ki_filp, current->comm);
+ return -EIO;
default:
WARN_ON_ONCE(1);
return -EIO;
@@ -406,8 +417,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
* Returns -ENOTBLK In case of a page invalidation invalidation failure for
* writes. The callers needs to fall back to buffered I/O in this case.
*/
-ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+struct iomap_dio *
+__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion)
{
@@ -421,14 +432,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct iomap_dio *dio;
if (!count)
- return 0;
+ return NULL;
if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
- return -EIO;
+ return ERR_PTR(-EIO);
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
dio->iocb = iocb;
atomic_set(&dio->ref, 1);
@@ -558,7 +569,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
if (!wait_for_completion)
- return -EIOCBQUEUED;
+ return ERR_PTR(-EIOCBQUEUED);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -574,10 +585,26 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
__set_current_state(TASK_RUNNING);
}
- return iomap_dio_complete(dio);
+ return dio;
out_free_dio:
kfree(dio);
- return ret;
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__iomap_dio_rw);
+
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+ bool wait_for_completion)
+{
+ struct iomap_dio *dio;
+
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
+ if (IS_ERR_OR_NULL(dio))
+ return PTR_ERR_OR_ZERO(dio);
+ return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 89f61d93c0bc..107ee80c3568 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -127,7 +127,7 @@ iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
SEEK_HOLE);
if (offset < 0)
return length;
- /* fall through */
+ fallthrough;
case IOMAP_HOLE:
*(loff_t *)data = offset;
return 0;
@@ -175,7 +175,7 @@ iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
SEEK_DATA);
if (offset < 0)
return length;
- /*FALLTHRU*/
+ fallthrough;
default:
*(loff_t *)data = offset;
return 0;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e4944436e733..17fdc482f554 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1285,7 +1285,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
* superblock as being NULL to prevent the journal destroy from writing
* back a bogus superblock.
*/
-static void journal_fail_superblock (journal_t *journal)
+static void journal_fail_superblock(journal_t *journal)
{
struct buffer_head *bh = journal->j_sb_buffer;
brelse(bh);
@@ -1367,8 +1367,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
int ret;
/* Buffer got discarded which means block device got invalidated */
- if (!buffer_mapped(bh))
+ if (!buffer_mapped(bh)) {
+ unlock_buffer(bh);
return -EIO;
+ }
trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
@@ -1815,7 +1817,7 @@ int jbd2_journal_destroy(journal_t *journal)
/**
- *int jbd2_journal_check_used_features () - Check if features specified are used.
+ *int jbd2_journal_check_used_features() - Check if features specified are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1825,7 +1827,7 @@ int jbd2_journal_destroy(journal_t *journal)
* features. Return true (non-zero) if it does.
**/
-int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
@@ -1860,7 +1862,7 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
* all of a given set of features on this journal. Return true
* (non-zero) if it can. */
-int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
if (!compat && !ro && !incompat)
@@ -1882,7 +1884,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
}
/**
- * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * int jbd2_journal_set_features() - Mark a given journal feature in the superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1893,7 +1895,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
*
*/
-int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
#define INCOMPAT_FEATURE_ON(f) \
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2ed278f0dced..faa97d748474 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -690,14 +690,11 @@ static int do_one_pass(journal_t *journal,
* number. */
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal)) {
- int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
be32_to_cpu(cbh->h_chksum[0]);
- chksum_err = chksum_seen = 0;
-
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
@@ -705,42 +702,23 @@ static int do_one_pass(journal_t *journal,
break;
}
- if (crc32_sum == found_chksum &&
- cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
- cbh->h_chksum_size ==
- JBD2_CRC32_CHKSUM_SIZE)
- chksum_seen = 1;
- else if (!(cbh->h_chksum_type == 0 &&
- cbh->h_chksum_size == 0 &&
- found_chksum == 0 &&
- !chksum_seen))
- /*
- * If fs is mounted using an old kernel and then
- * kernel with journal_chksum is used then we
- * get a situation where the journal flag has
- * checksum flag set but checksums are not
- * present i.e chksum = 0, in the individual
- * commit blocks.
- * Hence to avoid checksum failures, in this
- * situation, this extra check is added.
- */
- chksum_err = 1;
-
- if (chksum_err) {
- info->end_transaction = next_commit_ID;
-
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
- }
+ /* Neither checksum match nor unused? */
+ if (!((crc32_sum == found_chksum &&
+ cbh->h_chksum_type ==
+ JBD2_CRC32_CHKSUM &&
+ cbh->h_chksum_size ==
+ JBD2_CRC32_CHKSUM_SIZE) ||
+ (cbh->h_chksum_type == 0 &&
+ cbh->h_chksum_size == 0 &&
+ found_chksum == 0)))
+ goto chksum_error;
+
crc32_sum = ~0;
}
if (pass == PASS_SCAN &&
!jbd2_commit_block_csum_verify(journal,
bh->b_data)) {
+ chksum_error:
info->end_transaction = next_commit_ID;
if (!jbd2_has_feature_async_commit(journal)) {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e91aad3637a2..43985738aa86 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2026,6 +2026,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
*/
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
+ J_ASSERT_JH(jh, jh->b_transaction != NULL);
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
}
@@ -2078,10 +2081,6 @@ out:
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
- * code to release the buffers.
- *
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
@@ -2112,11 +2111,11 @@ out:
*
* Return 0 on failure, 1 on success
*/
-int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t gfp_mask)
+int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
{
struct buffer_head *head;
struct buffer_head *bh;
+ bool has_write_io_error = false;
int ret = 0;
J_ASSERT(PageLocked(page));
@@ -2141,11 +2140,26 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
jbd2_journal_put_journal_head(jh);
if (buffer_jbd(bh))
goto busy;
+
+ /*
+ * If we free a metadata buffer which has been failed to
+ * write out, the jbd2 checkpoint procedure will not detect
+ * this failure and may lead to filesystem inconsistency
+ * after cleanup journal tail.
+ */
+ if (buffer_write_io_error(bh)) {
+ pr_err("JBD2: Error while async write back metadata bh %llu.",
+ (unsigned long long)bh->b_blocknr);
+ has_write_io_error = true;
+ }
} while ((bh = bh->b_this_page) != head);
ret = try_to_free_buffers(page);
busy:
+ if (has_write_io_error)
+ jbd2_journal_abort(journal, -EIO);
+
return ret;
}
@@ -2572,6 +2586,13 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh)
was_dirty = test_clear_buffer_jbddirty(bh);
__jbd2_journal_temp_unlink_buffer(jh);
+
+ /*
+ * b_transaction must be set, otherwise the new b_transaction won't
+ * be holding jh reference
+ */
+ J_ASSERT_JH(jh, jh->b_transaction != NULL);
+
/*
* We set b_transaction here because b_next_transaction will inherit
* our jh reference and thus __jbd2_journal_file_buffer() must not
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index f20cff1194bb..776493713153 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -590,10 +590,14 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
int ret;
uint32_t now = JFFS2_NOW();
+ mutex_lock(&f->sem);
for (fd = f->dents ; fd; fd = fd->next) {
- if (fd->ino)
+ if (fd->ino) {
+ mutex_unlock(&f->sem);
return -ENOTEMPTY;
+ }
}
+ mutex_unlock(&f->sem);
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, f, now);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ab8cdd9e9325..78858f6e9583 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -341,7 +341,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
else
rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
- /* fall through */
+ fallthrough;
case S_IFSOCK:
case S_IFIFO:
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index bccfc40b3a74..2f6f0b140c05 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1273,7 +1273,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
dbg_readinode("symlink's target '%s' cached\n", f->target);
}
- /* fall through... */
+ fallthrough;
case S_IFBLK:
case S_IFCHR:
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 5f7e284e0df3..db72a9d2d0af 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -261,7 +261,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
}
#endif
if (c->nr_erasing_blocks) {
- if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) {
+ if (!c->used_size && !c->unchecked_size &&
+ ((c->nr_free_blocks+empty_blocks+bad_blocks) != c->nr_blocks || bad_blocks == c->nr_blocks)) {
pr_notice("Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n");
pr_notice("empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",
empty_blocks, bad_blocks, c->nr_blocks);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index a2f5338a5ea1..176580f54af9 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -473,7 +473,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
int block_offset;
- int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+ int blocks_per_page = i_blocks_per_page(inode, page);
sector_t page_start; /* address of page in fs blocks */
sector_t pblock;
int xlen;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4d08edf19c78..e0d42e977d9a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -137,11 +137,11 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case 1:
offset += file->f_pos;
- /* fall through */
+ fallthrough;
case 0:
if (offset >= 0)
break;
- /* fall through */
+ fallthrough;
default:
return -EINVAL;
}
diff --git a/fs/locks.c b/fs/locks.c
index 8fc0542f5132..1f84a03601fe 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1499,7 +1499,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
switch (arg) {
case F_UNLCK:
fl->fl_flags &= ~FL_UNLOCK_PENDING;
- /* fall through */
+ fallthrough;
case F_RDLCK:
fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
}
@@ -2525,7 +2525,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
cmd = F_SETLKW;
file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = filp;
- /* Fallthrough */
+ fallthrough;
case F_SETLKW:
file_lock->fl_flags |= FL_SLEEP;
}
@@ -2656,7 +2656,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
cmd = F_SETLKW64;
file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = filp;
- /* Fallthrough */
+ fallthrough;
case F_SETLKW64:
file_lock->fl_flags |= FL_SLEEP;
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7cb5fd38eb14..7b09a9158e40 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -150,6 +150,25 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
return 0;
}
+static bool minix_check_superblock(struct super_block *sb)
+{
+ struct minix_sb_info *sbi = minix_sb(sb);
+
+ if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
+ return false;
+
+ /*
+ * s_max_size must not exceed the block mapping limitation. This check
+ * is only needed for V1 filesystems, since V2/V3 support an extra level
+ * of indirect blocks which places the limit well above U32_MAX.
+ */
+ if (sbi->s_version == MINIX_V1 &&
+ sb->s_maxbytes > (7 + 512 + 512*512) * BLOCK_SIZE)
+ return false;
+
+ return true;
+}
+
static int minix_fill_super(struct super_block *s, void *data, int silent)
{
struct buffer_head *bh;
@@ -185,7 +204,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_zmap_blocks = ms->s_zmap_blocks;
sbi->s_firstdatazone = ms->s_firstdatazone;
sbi->s_log_zone_size = ms->s_log_zone_size;
- sbi->s_max_size = ms->s_max_size;
+ s->s_maxbytes = ms->s_max_size;
s->s_magic = ms->s_magic;
if (s->s_magic == MINIX_SUPER_MAGIC) {
sbi->s_version = MINIX_V1;
@@ -216,7 +235,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_zmap_blocks = m3s->s_zmap_blocks;
sbi->s_firstdatazone = m3s->s_firstdatazone;
sbi->s_log_zone_size = m3s->s_log_zone_size;
- sbi->s_max_size = m3s->s_max_size;
+ s->s_maxbytes = m3s->s_max_size;
sbi->s_ninodes = m3s->s_ninodes;
sbi->s_nzones = m3s->s_zones;
sbi->s_dirsize = 64;
@@ -228,11 +247,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
} else
goto out_no_fs;
+ if (!minix_check_superblock(s))
+ goto out_illegal_sb;
+
/*
* Allocate the buffer map to keep the superblock small.
*/
- if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
- goto out_illegal_sb;
i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
map = kzalloc(i, GFP_KERNEL);
if (!map)
@@ -468,6 +488,13 @@ static struct inode *V1_minix_iget(struct inode *inode)
iget_failed(inode);
return ERR_PTR(-EIO);
}
+ if (raw_inode->i_nlinks == 0) {
+ printk("MINIX-fs: deleted inode referenced: %lu\n",
+ inode->i_ino);
+ brelse(bh);
+ iget_failed(inode);
+ return ERR_PTR(-ESTALE);
+ }
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
@@ -501,6 +528,13 @@ static struct inode *V2_minix_iget(struct inode *inode)
iget_failed(inode);
return ERR_PTR(-EIO);
}
+ if (raw_inode->i_nlinks == 0) {
+ printk("MINIX-fs: deleted inode referenced: %lu\n",
+ inode->i_ino);
+ brelse(bh);
+ iget_failed(inode);
+ return ERR_PTR(-ESTALE);
+ }
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 043c3fdbc8e7..446148792f41 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -75,6 +75,7 @@ static int alloc_branch(struct inode *inode,
int n = 0;
int i;
int parent = minix_new_block(inode);
+ int err = -ENOSPC;
branch[0].key = cpu_to_block(parent);
if (parent) for (n = 1; n < num; n++) {
@@ -85,6 +86,11 @@ static int alloc_branch(struct inode *inode,
break;
branch[n].key = cpu_to_block(nr);
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh) {
+ minix_free_block(inode, nr);
+ err = -ENOMEM;
+ break;
+ }
lock_buffer(bh);
memset(bh->b_data, 0, bh->b_size);
branch[n].bh = bh;
@@ -103,7 +109,7 @@ static int alloc_branch(struct inode *inode,
bforget(branch[i].bh);
for (i = 0; i < n; i++)
minix_free_block(inode, block_to_cpu(branch[i].key));
- return -ENOSPC;
+ return err;
}
static inline int splice_branch(struct inode *inode,
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 046cc96ee7ad..1fed906042aa 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -29,12 +29,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
if (block < 0) {
printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
block, inode->i_sb->s_bdev);
- } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
- if (printk_ratelimit())
- printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %pg\n",
- block, inode->i_sb->s_bdev);
- } else if (block < 7) {
+ return 0;
+ }
+ if ((u64)block * BLOCK_SIZE >= inode->i_sb->s_maxbytes)
+ return 0;
+
+ if (block < 7) {
offsets[n++] = block;
} else if ((block -= 7) < 512) {
offsets[n++] = 7;
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f7fc7ecccccc..9d00f31a2d9d 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -32,13 +32,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
if (block < 0) {
printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
block, sb->s_bdev);
- } else if ((u64)block * (u64)sb->s_blocksize >=
- minix_sb(sb)->s_max_size) {
- if (printk_ratelimit())
- printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %pg\n",
- block, sb->s_bdev);
- } else if (block < DIRCOUNT) {
+ return 0;
+ }
+ if ((u64)block * (u64)sb->s_blocksize >= sb->s_maxbytes)
+ return 0;
+
+ if (block < DIRCOUNT) {
offsets[n++] = block;
} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
offsets[n++] = DIRCOUNT;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index df081e8afcc3..168d45d3de73 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -32,7 +32,6 @@ struct minix_sb_info {
unsigned long s_zmap_blocks;
unsigned long s_firstdatazone;
unsigned long s_log_zone_size;
- unsigned long s_max_size;
int s_dirsize;
int s_namelen;
struct buffer_head ** s_imap;
diff --git a/fs/namei.c b/fs/namei.c
index fde8fe086c09..f1eb8ccd2be9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -568,8 +568,8 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
struct super_block *sb = mnt->mnt_sb;
- /* Bind mounts and multi-root filesystems can have disconnected paths */
- if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
+ /* Bind mounts can have disconnected paths */
+ if (mnt->mnt_root == sb->s_root)
return true;
return is_subdir(dentry, mnt->mnt_root);
@@ -2851,16 +2851,24 @@ static int may_open(const struct path *path, int acc_mode, int flag)
case S_IFDIR:
if (acc_mode & MAY_WRITE)
return -EISDIR;
+ if (acc_mode & MAY_EXEC)
+ return -EACCES;
break;
case S_IFBLK:
case S_IFCHR:
if (!may_open_dev(path))
return -EACCES;
- /*FALLTHRU*/
+ fallthrough;
case S_IFIFO:
case S_IFSOCK:
+ if (acc_mode & MAY_EXEC)
+ return -EACCES;
flag &= ~O_TRUNC;
break;
+ case S_IFREG:
+ if ((acc_mode & MAY_EXEC) && path_noexec(path))
+ return -EACCES;
+ break;
}
error = inode_permission(inode, MAY_OPEN | acc_mode);
@@ -3770,11 +3778,11 @@ exit2:
mnt_drop_write(path.mnt);
exit1:
path_put(&path);
- putname(name);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+ putname(name);
return error;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index bae0e95b3713..294e05a13d17 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3072,10 +3072,10 @@ static void shrink_submounts(struct mount *mnt)
}
}
-void *copy_mount_options(const void __user * data)
+static void *copy_mount_options(const void __user * data)
{
char *copy;
- unsigned size;
+ unsigned left, offset;
if (!data)
return NULL;
@@ -3084,20 +3084,31 @@ void *copy_mount_options(const void __user * data)
if (!copy)
return ERR_PTR(-ENOMEM);
- size = PAGE_SIZE - offset_in_page(data);
+ left = copy_from_user(copy, data, PAGE_SIZE);
- if (copy_from_user(copy, data, size)) {
+ /*
+ * Not all architectures have an exact copy_from_user(). Resort to
+ * byte at a time.
+ */
+ offset = PAGE_SIZE - left;
+ while (left) {
+ char c;
+ if (get_user(c, (const char __user *)data + offset))
+ break;
+ copy[offset] = c;
+ left--;
+ offset++;
+ }
+
+ if (left == PAGE_SIZE) {
kfree(copy);
return ERR_PTR(-EFAULT);
}
- if (size != PAGE_SIZE) {
- if (copy_from_user(copy + size, data + size, PAGE_SIZE - size))
- memset(copy + size, 0, PAGE_SIZE - size);
- }
+
return copy;
}
-char *copy_mount_string(const void __user *data)
+static char *copy_mount_string(const void __user *data)
{
return data ? strndup_user(data, PATH_MAX) : NULL;
}
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 2433c3e03cfa..22d11fdc6deb 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -30,7 +30,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
-nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
+nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index d1a0e2c8b1b4..08108b6d2fa1 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -753,7 +753,7 @@ out:
case -ENODEV:
/* Our extent block devices are unavailable */
set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags);
- /* Fall through */
+ fallthrough;
case 0:
return lseg;
default:
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 9fb067a6f7e0..ef9db135c649 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -79,7 +79,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
goto out_free_data;
bl_msg = msg->data;
- bl_msg->type = BL_DEVICE_MOUNT,
+ bl_msg->type = BL_DEVICE_MOUNT;
bl_msg->totallen = b->simple.len;
nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f1ff3076e4a4..4b8cc93913f7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -50,6 +50,7 @@
#include "nfs.h"
#include "netns.h"
#include "sysfs.h"
+#include "nfs42.h"
#define NFSDBG_FACILITY NFSDBG_CLIENT
@@ -749,7 +750,7 @@ error:
static void nfs_server_set_fsinfo(struct nfs_server *server,
struct nfs_fsinfo *fsinfo)
{
- unsigned long max_rpc_payload;
+ unsigned long max_rpc_payload, raw_max_rpc_payload;
/* Work out a lot of parameters */
if (server->rsize == 0)
@@ -762,7 +763,9 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
- max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+ raw_max_rpc_payload = rpc_max_payload(server->client);
+ max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL);
+
if (server->rsize > max_rpc_payload)
server->rsize = max_rpc_payload;
if (server->rsize > NFS_MAX_FILE_IO_SIZE)
@@ -795,6 +798,21 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+
+#ifdef CONFIG_NFS_V4_2
+ /*
+ * Defaults until limited by the session parameters.
+ */
+ server->gxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->sxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->lxasize = min_t(unsigned int, raw_max_rpc_payload,
+ nfs42_listxattr_xdrsize(XATTR_LIST_MAX));
+
+ if (fsinfo->xattr_support)
+ server->caps |= NFS_CAP_XATTR;
+#endif
}
/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5a331da5f55a..cb52db9a0cfb 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -579,6 +579,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do {
+ if (entry->label)
+ entry->label->len = NFS4_MAXLABELLEN;
+
status = xdr_decode(desc, entry, &stream);
if (status != 0) {
if (status == -EAGAIN)
@@ -1181,7 +1184,7 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
/* A NFSv4 OPEN will revalidate later */
if (server->caps & NFS_CAP_ATOMIC_OPEN)
goto out;
- /* Fallthrough */
+ fallthrough;
case S_IFDIR:
if (server->flags & NFS_MOUNT_NOCTO)
break;
@@ -2460,7 +2463,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co
return NULL;
}
-static int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
+static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
@@ -2533,6 +2536,20 @@ out:
return err;
}
+int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct
+nfs_access_entry *res, bool may_block)
+{
+ int status;
+
+ status = nfs_access_get_cached_rcu(inode, cred, res);
+ if (status != 0)
+ status = nfs_access_get_cached_locked(inode, cred, res,
+ may_block);
+
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_access_get_cached);
+
static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -2647,9 +2664,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
trace_nfs_access_enter(inode);
- status = nfs_access_get_cached_rcu(inode, cred, &cache);
- if (status != 0)
- status = nfs_access_get_cached(inode, cred, &cache, may_block);
+ status = nfs_access_get_cached(inode, cred, &cache, may_block);
if (status == 0)
goto out_cached;
@@ -2661,6 +2676,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
* Determine which access bits we want to ask for...
*/
cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
+ if (nfs_server_capable(inode, NFS_CAP_XATTR)) {
+ cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
+ NFS_ACCESS_XALIST;
+ }
if (S_ISDIR(inode->i_mode))
cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
else
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1b79dd5cf661..2d30a4da49fa 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -896,7 +896,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
- ssize_t result = -EINVAL, requested;
+ ssize_t result, requested;
size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f96367a2463e..63940a7a70be 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -140,6 +140,7 @@ static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
+ errseq_t since;
dprintk("NFS: flush(%pD2)\n", file);
@@ -148,7 +149,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
return 0;
/* Flush writes to the server and return any errors */
- return nfs_wb_all(inode);
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
}
ssize_t
@@ -587,12 +590,14 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_check_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode,
+ int error)
{
struct nfs_open_context *ctx;
ctx = nfs_file_open_context(filp);
- if (nfs_ctx_key_to_expire(ctx, inode))
+ if (nfs_error_is_fatal_on_server(error) ||
+ nfs_ctx_key_to_expire(ctx, inode))
return 1;
return 0;
}
@@ -603,6 +608,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
+ errseq_t since;
+ int error;
result = nfs_key_timeout_notify(file, inode);
if (result)
@@ -627,6 +634,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_pos > i_size_read(inode))
nfs_revalidate_mapping(inode, file->f_mapping);
+ since = filemap_sample_wb_err(file->f_mapping);
nfs_start_io_write(inode);
result = generic_write_checks(iocb, from);
if (result > 0) {
@@ -645,7 +653,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
/* Return error values */
- if (nfs_need_check_write(file, inode)) {
+ error = filemap_check_wb_err(file->f_mapping, since);
+ if (nfs_need_check_write(file, inode, error)) {
int err = nfs_wb_all(inode);
if (err < 0)
result = err;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index a13e69009f19..7f5aa0403e16 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -187,7 +187,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
pnfs_error_mark_layout_for_return(inode, lseg);
pnfs_set_lo_fail(lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
- /* fall through */
+ fallthrough;
default:
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index de03e440b7ee..a163533446fa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -715,7 +715,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
}
static void
-ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -724,7 +724,7 @@ ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
}
static void
-ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -734,14 +734,14 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx,
+ u32 start_idx, u32 *best_idx,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
bool fail_return = false;
- int idx;
+ u32 idx;
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
@@ -766,21 +766,21 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
struct nfs4_pnfs_ds *ds;
@@ -790,6 +790,20 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
+static struct nfs4_pnfs_ds *
+ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ u32 *best_idx)
+{
+ struct pnfs_layout_segment *lseg = pgio->pg_lseg;
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
+ best_idx);
+ if (ds || !pgio->pg_mirror_idx)
+ return ds;
+ return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+}
+
static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req,
@@ -824,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
- int ds_idx;
+ u32 ds_idx, i;
retry:
ff_layout_pg_check_layout(pgio, req);
@@ -840,25 +854,24 @@ retry:
goto out_nolseg;
}
- ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
+ pnfs_generic_pg_cleanup(pgio);
/* Sleep for 1 second before retrying */
ssleep(1);
goto retry;
}
- mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ pgm = &pgio->pg_mirrors[i];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+ }
pgio->pg_mirror_idx = ds_idx;
- /* read always uses only one mirror - idx 0 for pgio layer */
- pgm = &pgio->pg_mirrors[0];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
-
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
pgio->pg_maxretrans = io_maxretrans;
@@ -871,8 +884,6 @@ out_mds:
0, NFS4_MAX_UINT64, IOMODE_READ,
NFS_I(pgio->pg_inode)->layout,
pgio->pg_lseg);
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_read_mds(pgio);
}
@@ -884,7 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
struct nfs4_pnfs_ds *ds;
- int i;
+ u32 i;
retry:
ff_layout_pg_check_layout(pgio, req);
@@ -916,8 +927,7 @@ retry:
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
+ pnfs_generic_pg_cleanup(pgio);
/* Sleep for 1 second before retrying */
ssleep(1);
goto retry;
@@ -939,8 +949,6 @@ out_mds:
0, NFS4_MAX_UINT64, IOMODE_RW,
NFS_I(pgio->pg_inode)->layout,
pgio->pg_lseg);
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_write_mds(pgio);
pgio->pg_error = -EAGAIN;
@@ -953,8 +961,8 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
GFP_NOFS);
@@ -1028,11 +1036,24 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
}
}
+static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
+{
+ u32 idx = hdr->pgio_mirror_idx + 1;
+ u32 new_idx = 0;
+
+ if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx))
+ ff_layout_send_layouterror(hdr->lseg);
+ else
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ pnfs_read_resend_pnfs(hdr, new_idx);
+}
+
static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
{
struct rpc_task *task = &hdr->task;
pnfs_layoutcommit_inode(hdr->inode, false);
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
@@ -1055,7 +1076,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
struct pnfs_layout_hdr *lo = lseg->pls_layout;
struct inode *inode = lo->plh_inode;
@@ -1113,7 +1134,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
rpc_wake_up(&tbl->slot_tbl_waitq);
- /* fall through */
+ fallthrough;
default:
if (ff_layout_avoid_mds_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
@@ -1129,7 +1150,7 @@ reset:
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1164,7 +1185,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
@@ -1191,7 +1212,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
}
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
- int idx, u64 offset, u64 length,
+ u32 idx, u64 offset, u64 length,
u32 *op_status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -1234,7 +1255,13 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
break;
case NFS4ERR_NXIO:
ff_layout_mark_ds_unreachable(lseg, idx);
- /* Fallthrough */
+ /*
+ * Don't return the layout if this is a read and we still
+ * have layouts to try
+ */
+ if (opnum == OP_READ)
+ break;
+ fallthrough;
default:
pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
lseg);
@@ -1247,7 +1274,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- int new_idx = hdr->pgio_mirror_idx;
int err;
if (task->tk_status < 0) {
@@ -1267,10 +1293,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- if (ff_layout_choose_best_ds_for_read(hdr->lseg,
- hdr->pgio_mirror_idx + 1,
- &new_idx))
- goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1281,10 +1303,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
}
return 0;
-out_layouterror:
- ff_layout_read_record_layoutstats_done(task, hdr);
- ff_layout_send_layouterror(hdr->lseg);
- hdr->pgio_mirror_idx = new_idx;
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -1411,10 +1429,9 @@ static void ff_layout_read_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
- ff_layout_send_layouterror(hdr->lseg);
- pnfs_read_resend_pnfs(hdr);
- } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ ff_layout_resend_pnfs_read(hdr);
+ else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_read(hdr);
pnfs_generic_rw_release(data);
}
@@ -1793,7 +1810,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
- int idx = hdr->pgio_mirror_idx;
+ u32 idx = hdr->pgio_mirror_idx;
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index ccc88be88d6a..222afba70bc0 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -651,21 +651,21 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
case Opt_xprt_udp6:
protofamily = AF_INET6;
- /* fall through */
+ fallthrough;
case Opt_xprt_udp:
ctx->flags &= ~NFS_MOUNT_TCP;
ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
break;
case Opt_xprt_tcp6:
protofamily = AF_INET6;
- /* fall through */
+ fallthrough;
case Opt_xprt_tcp:
ctx->flags |= NFS_MOUNT_TCP;
ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
break;
case Opt_xprt_rdma6:
protofamily = AF_INET6;
- /* fall through */
+ fallthrough;
case Opt_xprt_rdma:
/* vector side protocols to TCP */
ctx->flags |= NFS_MOUNT_TCP;
@@ -684,13 +684,13 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
case Opt_xprt_udp6:
mountfamily = AF_INET6;
- /* fall through */
+ fallthrough;
case Opt_xprt_udp:
ctx->mount_server.protocol = XPRT_TRANSPORT_UDP;
break;
case Opt_xprt_tcp6:
mountfamily = AF_INET6;
- /* fall through */
+ fallthrough;
case Opt_xprt_tcp:
ctx->mount_server.protocol = XPRT_TRANSPORT_TCP;
break;
@@ -899,9 +899,11 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
ctx->version = NFS_DEFAULT_VERSION;
switch (data->version) {
case 1:
- data->namlen = 0; /* fall through */
+ data->namlen = 0;
+ fallthrough;
case 2:
- data->bsize = 0; /* fall through */
+ data->bsize = 0;
+ fallthrough;
case 3:
if (data->flags & NFS_MOUNT_VER3)
goto out_no_v3;
@@ -909,14 +911,14 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
/* Turn off security negotiation */
extra_flags |= NFS_MOUNT_SECFLAVOUR;
- /* fall through */
+ fallthrough;
case 4:
if (data->flags & NFS_MOUNT_SECFLAVOUR)
goto out_no_sec;
- /* fall through */
+ fallthrough;
case 5:
memset(data->context, 0, sizeof(data->context));
- /* fall through */
+ fallthrough;
case 6:
if (data->flags & NFS_MOUNT_VER3) {
if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
@@ -982,7 +984,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
/*
* The legacy version 6 binary mount data from userspace has a
* field used only to transport selinux information into the
- * the kernel. To continue to support that functionality we
+ * kernel. To continue to support that functionality we
* have a touch of selinux knowledge here in the NFS code. The
* userspace code converted context=blah to just blah so we are
* converting back to the full string selinux understands.
@@ -1037,6 +1039,65 @@ out_invalid_fh:
}
#if IS_ENABLED(CONFIG_NFS_V4)
+struct compat_nfs_string {
+ compat_uint_t len;
+ compat_uptr_t data;
+};
+
+static inline void compat_nfs_string(struct nfs_string *dst,
+ struct compat_nfs_string *src)
+{
+ dst->data = compat_ptr(src->data);
+ dst->len = src->len;
+}
+
+struct compat_nfs4_mount_data_v1 {
+ compat_int_t version;
+ compat_int_t flags;
+ compat_int_t rsize;
+ compat_int_t wsize;
+ compat_int_t timeo;
+ compat_int_t retrans;
+ compat_int_t acregmin;
+ compat_int_t acregmax;
+ compat_int_t acdirmin;
+ compat_int_t acdirmax;
+ struct compat_nfs_string client_addr;
+ struct compat_nfs_string mnt_path;
+ struct compat_nfs_string hostname;
+ compat_uint_t host_addrlen;
+ compat_uptr_t host_addr;
+ compat_int_t proto;
+ compat_int_t auth_flavourlen;
+ compat_uptr_t auth_flavours;
+};
+
+static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data)
+{
+ struct compat_nfs4_mount_data_v1 *compat =
+ (struct compat_nfs4_mount_data_v1 *)data;
+
+ /* copy the fields backwards */
+ data->auth_flavours = compat_ptr(compat->auth_flavours);
+ data->auth_flavourlen = compat->auth_flavourlen;
+ data->proto = compat->proto;
+ data->host_addr = compat_ptr(compat->host_addr);
+ data->host_addrlen = compat->host_addrlen;
+ compat_nfs_string(&data->hostname, &compat->hostname);
+ compat_nfs_string(&data->mnt_path, &compat->mnt_path);
+ compat_nfs_string(&data->client_addr, &compat->client_addr);
+ data->acdirmax = compat->acdirmax;
+ data->acdirmin = compat->acdirmin;
+ data->acregmax = compat->acregmax;
+ data->acregmin = compat->acregmin;
+ data->retrans = compat->retrans;
+ data->timeo = compat->timeo;
+ data->wsize = compat->wsize;
+ data->rsize = compat->rsize;
+ data->flags = compat->flags;
+ data->version = compat->version;
+}
+
/*
* Validate NFSv4 mount options
*/
@@ -1047,89 +1108,83 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
char *c;
- if (data == NULL)
- goto out_no_data;
+ if (!data) {
+ if (is_remount_fc(fc))
+ goto done;
+ return nfs_invalf(fc,
+ "NFS4: mount program didn't pass any mount data");
+ }
ctx->version = 4;
- switch (data->version) {
- case 1:
- if (data->host_addrlen > sizeof(ctx->nfs_server.address))
- goto out_no_address;
- if (data->host_addrlen == 0)
- goto out_no_address;
- ctx->nfs_server.addrlen = data->host_addrlen;
- if (copy_from_user(sap, data->host_addr, data->host_addrlen))
- return -EFAULT;
- if (!nfs_verify_server_address(sap))
- goto out_no_address;
- ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
-
- if (data->auth_flavourlen) {
- rpc_authflavor_t pseudoflavor;
- if (data->auth_flavourlen > 1)
- goto out_inval_auth;
- if (copy_from_user(&pseudoflavor,
- data->auth_flavours,
- sizeof(pseudoflavor)))
- return -EFAULT;
- ctx->selected_flavor = pseudoflavor;
- } else
- ctx->selected_flavor = RPC_AUTH_UNIX;
-
- c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->nfs_server.hostname = c;
+ if (data->version != 1)
+ return generic_parse_monolithic(fc, data);
- c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->nfs_server.export_path = c;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+ if (in_compat_syscall())
+ nfs4_compat_mount_data_conv(data);
- c = strndup_user(data->client_addr.data, 16);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->client_address = c;
-
- /*
- * Translate to nfs_fs_context, which nfs_fill_super
- * can deal with.
- */
+ if (data->host_addrlen > sizeof(ctx->nfs_server.address))
+ goto out_no_address;
+ if (data->host_addrlen == 0)
+ goto out_no_address;
+ ctx->nfs_server.addrlen = data->host_addrlen;
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
+ return -EFAULT;
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+ ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
- ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
- ctx->rsize = data->rsize;
- ctx->wsize = data->wsize;
- ctx->timeo = data->timeo;
- ctx->retrans = data->retrans;
- ctx->acregmin = data->acregmin;
- ctx->acregmax = data->acregmax;
- ctx->acdirmin = data->acdirmin;
- ctx->acdirmax = data->acdirmax;
- ctx->nfs_server.protocol = data->proto;
- nfs_validate_transport_protocol(ctx);
- if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
- goto out_invalid_transport_udp;
+ if (data->auth_flavourlen) {
+ rpc_authflavor_t pseudoflavor;
- break;
- default:
- goto generic;
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
+ if (copy_from_user(&pseudoflavor, data->auth_flavours,
+ sizeof(pseudoflavor)))
+ return -EFAULT;
+ ctx->selected_flavor = pseudoflavor;
+ } else {
+ ctx->selected_flavor = RPC_AUTH_UNIX;
}
+ c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.hostname = c;
+
+ c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.export_path = c;
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+
+ c = strndup_user(data->client_addr.data, 16);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->client_address = c;
+
+ /*
+ * Translate to nfs_fs_context, which nfs_fill_super
+ * can deal with.
+ */
+
+ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
+ ctx->rsize = data->rsize;
+ ctx->wsize = data->wsize;
+ ctx->timeo = data->timeo;
+ ctx->retrans = data->retrans;
+ ctx->acregmin = data->acregmin;
+ ctx->acregmax = data->acregmax;
+ ctx->acdirmin = data->acdirmin;
+ ctx->acdirmax = data->acdirmax;
+ ctx->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(ctx);
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
+done:
ctx->skip_reconfig_option_check = true;
return 0;
-generic:
- return generic_parse_monolithic(fc, data);
-
-out_no_data:
- if (is_remount_fc(fc)) {
- ctx->skip_reconfig_option_check = true;
- return 0;
- }
- return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data");
-
out_inval_auth:
return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d",
data->auth_flavourlen);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0bf1f835de01..aa6493905bbe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -193,6 +193,7 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
return nfs_check_cache_invalid_not_delegated(inode, flags);
}
+EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
@@ -204,7 +205,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
flags &= ~NFS_INO_INVALID_OTHER;
flags &= ~(NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_SIZE
- | NFS_INO_REVAL_PAGECACHE);
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_INVALID_XATTR);
}
if (inode->i_mapping->nrpages == 0)
@@ -233,11 +235,13 @@ static void nfs_zap_caches_locked(struct inode *inode)
| NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR
| NFS_INO_REVAL_PAGECACHE);
} else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR
| NFS_INO_REVAL_PAGECACHE);
nfs_zap_label_cache_locked(nfsi);
}
@@ -542,6 +546,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_gid = fattr->gid;
else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -794,8 +800,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
trace_nfs_getattr_enter(inode);
- if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync)
+ if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
+ nfs_readdirplus_parent_cache_hit(path->dentry);
goto out_no_update;
+ }
/* Flush out writes to the server in order to update c/mtime. */
if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
@@ -1375,6 +1383,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode_set_iversion_raw(inode, fattr->change_attr);
if (S_ISDIR(inode->i_mode))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+ else if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
}
/* If we have atomic WCC data, we may update some attributes */
ts = inode->i_ctime;
@@ -1892,7 +1902,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (!(have_writers || have_delegation)) {
invalid |= NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL;
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR;
/* Force revalidate of all attributes */
save_cache_validity |= NFS_INO_INVALID_CTIME
| NFS_INO_INVALID_MTIME
@@ -2095,6 +2106,9 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
#if IS_ENABLED(CONFIG_NFS_V4)
nfsi->nfs4_acl = NULL;
#endif /* CONFIG_NFS_V4 */
+#ifdef CONFIG_NFS_V4_2
+ nfsi->xattr_cache = NULL;
+#endif
return &nfsi->vfs_inode;
}
EXPORT_SYMBOL_GPL(nfs_alloc_inode);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 26c94b32d6f4..c6c863382f37 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -108,7 +108,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
case -EPROTONOSUPPORT:
dprintk("NFS_V3_ACL extension not supported; disabling\n");
server->caps &= ~NFS_CAP_ACLS;
- /* fall through */
+ fallthrough;
case -ENOTSUPP:
status = -EOPNOTSUPP;
default:
@@ -228,7 +228,7 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
dprintk("NFS_V3_ACL SETACL RPC not supported"
"(will not retry)\n");
server->caps &= ~NFS_CAP_ACLS;
- /* fall through */
+ fallthrough;
case -ENOTSUPP:
status = -EOPNOTSUPP;
}
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index c891af949886..0fe5aacbcfdf 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -6,6 +6,8 @@
#ifndef __LINUX_FS_NFS_NFS4_2_H
#define __LINUX_FS_NFS_NFS4_2_H
+#include <linux/xattr.h>
+
/*
* FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
* more? Need to consider not to pre-alloc too much for a compound.
@@ -36,5 +38,27 @@ static inline bool nfs42_files_from_same_server(struct file *in,
return nfs4_check_serverowner_major_id(c_in->cl_serverowner,
c_out->cl_serverowner);
}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen);
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags);
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp);
+int nfs42_proc_removexattr(struct inode *inode, const char *name);
+
+/*
+ * Maximum XDR buffer size needed for a listxattr buffer of buflen size.
+ *
+ * The upper boundary is a buffer with all 1-byte sized attribute names.
+ * They would be 7 bytes long in the eventual buffer ("user.x\0"), and
+ * 8 bytes long XDR-encoded.
+ *
+ * Include the trailing eof word as well.
+ */
+static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
+{
+ return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4;
+}
#endif /* CONFIG_NFS_V4_2 */
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index e2ae54b35dfe..2b2211d1234e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -17,6 +17,7 @@
#include "nfs4session.h"
#include "internal.h"
#include "delegation.h"
+#include "nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PROC
static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
@@ -355,7 +356,15 @@ static ssize_t _nfs42_proc_copy(struct file *src,
truncate_pagecache_range(dst_inode, pos_dst,
pos_dst + res->write_res.count);
-
+ spin_lock(&dst_inode->i_lock);
+ NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA);
+ spin_unlock(&dst_inode->i_lock);
+ spin_lock(&src_inode->i_lock);
+ NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME);
+ spin_unlock(&src_inode->i_lock);
status = res->write_res.count;
out:
if (args->sync)
@@ -714,7 +723,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
- break;
+ return;
case -NFS4ERR_BADHANDLE:
case -ESTALE:
pnfs_destroy_layout(NFS_I(inode));
@@ -760,6 +769,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
}
+
+ trace_nfs4_layoutstats(inode, &data->args.stateid, task->tk_status);
}
static void
@@ -882,7 +893,7 @@ nfs42_layouterror_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
- break;
+ return;
case -NFS4ERR_BADHANDLE:
case -ESTALE:
pnfs_destroy_layout(NFS_I(inode));
@@ -926,6 +937,9 @@ nfs42_layouterror_done(struct rpc_task *task, void *calldata)
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
}
+
+ trace_nfs4_layouterror(inode, &data->args.errors[0].stateid,
+ task->tk_status);
}
static void
@@ -1088,3 +1102,251 @@ out_put_src_lock:
nfs_put_lock_context(src_lock);
return err;
}
+
+#define NFS4XATTR_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
+
+static int _nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs42_removexattrargs args = {
+ .fh = NFS_FH(inode),
+ .xattr_name = name,
+ };
+ struct nfs42_removexattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int ret;
+ unsigned long timestamp = jiffies;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+ &res.seq_res, 1);
+ if (!ret)
+ nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
+
+ return ret;
+}
+
+static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFS4XATTR_MAXPAGES];
+ struct nfs42_setxattrargs arg = {
+ .fh = NFS_FH(inode),
+ .xattr_pages = pages,
+ .xattr_len = buflen,
+ .xattr_name = name,
+ .xattr_flags = flags,
+ };
+ struct nfs42_setxattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int ret, np;
+ unsigned long timestamp = jiffies;
+
+ if (buflen > server->sxasize)
+ return -ERANGE;
+
+ if (buflen > 0) {
+ np = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages);
+ if (np < 0)
+ return np;
+ } else
+ np = 0;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 1);
+
+ for (; np > 0; np--)
+ put_page(pages[np - 1]);
+
+ if (!ret)
+ nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
+
+ return ret;
+}
+
+static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFS4XATTR_MAXPAGES] = {};
+ struct nfs42_getxattrargs arg = {
+ .fh = NFS_FH(inode),
+ .xattr_pages = pages,
+ .xattr_len = buflen,
+ .xattr_name = name,
+ };
+ struct nfs42_getxattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETXATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int ret, np;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Normally, the caching is done one layer up, but for successful
+ * RPCS, always cache the result here, even if the caller was
+ * just querying the length, or if the reply was too big for
+ * the caller. This avoids a second RPC in the case of the
+ * common query-alloc-retrieve cycle for xattrs.
+ *
+ * Note that xattr_len is always capped to XATTR_SIZE_MAX.
+ */
+
+ nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len);
+
+ if (buflen) {
+ if (res.xattr_len > buflen)
+ return -ERANGE;
+ _copy_from_pages(buf, pages, 0, res.xattr_len);
+ }
+
+ np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
+ while (--np >= 0)
+ __free_page(pages[np]);
+
+ return res.xattr_len;
+}
+
+static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page **pages;
+ struct nfs42_listxattrsargs arg = {
+ .fh = NFS_FH(inode),
+ .cookie = *cookiep,
+ };
+ struct nfs42_listxattrsres res = {
+ .eof = false,
+ .xattr_buf = buf,
+ .xattr_len = buflen,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LISTXATTRS],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ u32 xdrlen;
+ int ret, np;
+
+
+ res.scratch = alloc_page(GFP_KERNEL);
+ if (!res.scratch)
+ return -ENOMEM;
+
+ xdrlen = nfs42_listxattr_xdrsize(buflen);
+ if (xdrlen > server->lxasize)
+ xdrlen = server->lxasize;
+ np = xdrlen / PAGE_SIZE + 1;
+
+ pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ __free_page(res.scratch);
+ return -ENOMEM;
+ }
+
+ arg.xattr_pages = pages;
+ arg.count = xdrlen;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 0);
+
+ if (ret >= 0) {
+ ret = res.copied;
+ *cookiep = res.cookie;
+ *eofp = res.eof;
+ }
+
+ while (--np >= 0) {
+ if (pages[np])
+ __free_page(pages[np]);
+ }
+
+ __free_page(res.scratch);
+ kfree(pages);
+
+ return ret;
+
+}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen)
+{
+ struct nfs4_exception exception = { };
+ ssize_t err;
+
+ do {
+ err = _nfs42_proc_getxattr(inode, name, buf, buflen);
+ if (err >= 0)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_setxattr(inode, name, buf, buflen, flags);
+ if (!err)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp)
+{
+ struct nfs4_exception exception = { };
+ ssize_t err;
+
+ do {
+ err = _nfs42_proc_listxattrs(inode, buf, buflen,
+ cookiep, eofp);
+ if (err >= 0)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+int nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_removexattr(inode, name);
+ if (!err)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
new file mode 100644
index 000000000000..86777996cfec
--- /dev/null
+++ b/fs/nfs/nfs42xattr.c
@@ -0,0 +1,1056 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * User extended attribute client side cache functions.
+ *
+ * Author: Frank van der Linden <fllinden@amazon.com>
+ */
+#include <linux/errno.h>
+#include <linux/nfs_fs.h>
+#include <linux/hashtable.h>
+#include <linux/refcount.h>
+#include <uapi/linux/xattr.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+
+/*
+ * User extended attributes client side caching is implemented by having
+ * a cache structure attached to NFS inodes. This structure is allocated
+ * when needed, and freed when the cache is zapped.
+ *
+ * The cache structure contains as hash table of entries, and a pointer
+ * to a special-cased entry for the listxattr cache.
+ *
+ * Accessing and allocating / freeing the caches is done via reference
+ * counting. The cache entries use a similar refcounting scheme.
+ *
+ * This makes freeing a cache, both from the shrinker and from the
+ * zap cache path, easy. It also means that, in current use cases,
+ * the large majority of inodes will not waste any memory, as they
+ * will never have any user extended attributes assigned to them.
+ *
+ * Attribute entries are hashed in to a simple hash table. They are
+ * also part of an LRU.
+ *
+ * There are three shrinkers.
+ *
+ * Two shrinkers deal with the cache entries themselves: one for
+ * large entries (> PAGE_SIZE), and one for smaller entries. The
+ * shrinker for the larger entries works more aggressively than
+ * those for the smaller entries.
+ *
+ * The other shrinker frees the cache structures themselves.
+ */
+
+/*
+ * 64 buckets is a good default. There is likely no reasonable
+ * workload that uses more than even 64 user extended attributes.
+ * You can certainly add a lot more - but you get what you ask for
+ * in those circumstances.
+ */
+#define NFS4_XATTR_HASH_SIZE 64
+
+#define NFSDBG_FACILITY NFSDBG_XATTRCACHE
+
+struct nfs4_xattr_cache;
+struct nfs4_xattr_entry;
+
+struct nfs4_xattr_bucket {
+ spinlock_t lock;
+ struct hlist_head hlist;
+ struct nfs4_xattr_cache *cache;
+ bool draining;
+};
+
+struct nfs4_xattr_cache {
+ struct kref ref;
+ spinlock_t hash_lock; /* protects hashtable and lru */
+ struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE];
+ struct list_head lru;
+ struct list_head dispose;
+ atomic_long_t nent;
+ spinlock_t listxattr_lock;
+ struct inode *inode;
+ struct nfs4_xattr_entry *listxattr;
+};
+
+struct nfs4_xattr_entry {
+ struct kref ref;
+ struct hlist_node hnode;
+ struct list_head lru;
+ struct list_head dispose;
+ char *xattr_name;
+ void *xattr_value;
+ size_t xattr_size;
+ struct nfs4_xattr_bucket *bucket;
+ uint32_t flags;
+};
+
+#define NFS4_XATTR_ENTRY_EXTVAL 0x0001
+
+/*
+ * LRU list of NFS inodes that have xattr caches.
+ */
+static struct list_lru nfs4_xattr_cache_lru;
+static struct list_lru nfs4_xattr_entry_lru;
+static struct list_lru nfs4_xattr_large_entry_lru;
+
+static struct kmem_cache *nfs4_xattr_cache_cachep;
+
+/*
+ * Hashing helper functions.
+ */
+static void
+nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache)
+{
+ unsigned int i;
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ INIT_HLIST_HEAD(&cache->buckets[i].hlist);
+ spin_lock_init(&cache->buckets[i].lock);
+ cache->buckets[i].cache = cache;
+ cache->buckets[i].draining = false;
+ }
+}
+
+/*
+ * Locking order:
+ * 1. inode i_lock or bucket lock
+ * 2. list_lru lock (taken by list_lru_* functions)
+ */
+
+/*
+ * Wrapper functions to add a cache entry to the right LRU.
+ */
+static bool
+nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
+{
+ struct list_lru *lru;
+
+ lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ return list_lru_add(lru, &entry->lru);
+}
+
+static bool
+nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
+{
+ struct list_lru *lru;
+
+ lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ return list_lru_del(lru, &entry->lru);
+}
+
+/*
+ * This function allocates cache entries. They are the normal
+ * extended attribute name/value pairs, but may also be a listxattr
+ * cache. Those allocations use the same entry so that they can be
+ * treated as one by the memory shrinker.
+ *
+ * xattr cache entries are allocated together with names. If the
+ * value fits in to one page with the entry structure and the name,
+ * it will also be part of the same allocation (kmalloc). This is
+ * expected to be the vast majority of cases. Larger allocations
+ * have a value pointer that is allocated separately by kvmalloc.
+ *
+ * Parameters:
+ *
+ * @name: Name of the extended attribute. NULL for listxattr cache
+ * entry.
+ * @value: Value of attribute, or listxattr cache. NULL if the
+ * value is to be copied from pages instead.
+ * @pages: Pages to copy the value from, if not NULL. Passed in to
+ * make it easier to copy the value after an RPC, even if
+ * the value will not be passed up to application (e.g.
+ * for a 'query' getxattr with NULL buffer).
+ * @len: Length of the value. Can be 0 for zero-length attribues.
+ * @value and @pages will be NULL if @len is 0.
+ */
+static struct nfs4_xattr_entry *
+nfs4_xattr_alloc_entry(const char *name, const void *value,
+ struct page **pages, size_t len)
+{
+ struct nfs4_xattr_entry *entry;
+ void *valp;
+ char *namep;
+ size_t alloclen, slen;
+ char *buf;
+ uint32_t flags;
+
+ BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) +
+ XATTR_NAME_MAX + 1 > PAGE_SIZE);
+
+ alloclen = sizeof(struct nfs4_xattr_entry);
+ if (name != NULL) {
+ slen = strlen(name) + 1;
+ alloclen += slen;
+ } else
+ slen = 0;
+
+ if (alloclen + len <= PAGE_SIZE) {
+ alloclen += len;
+ flags = 0;
+ } else {
+ flags = NFS4_XATTR_ENTRY_EXTVAL;
+ }
+
+ buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ if (buf == NULL)
+ return NULL;
+ entry = (struct nfs4_xattr_entry *)buf;
+
+ if (name != NULL) {
+ namep = buf + sizeof(struct nfs4_xattr_entry);
+ memcpy(namep, name, slen);
+ } else {
+ namep = NULL;
+ }
+
+
+ if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
+ valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ if (valp == NULL) {
+ kfree(buf);
+ return NULL;
+ }
+ } else if (len != 0) {
+ valp = buf + sizeof(struct nfs4_xattr_entry) + slen;
+ } else
+ valp = NULL;
+
+ if (valp != NULL) {
+ if (value != NULL)
+ memcpy(valp, value, len);
+ else
+ _copy_from_pages(valp, pages, 0, len);
+ }
+
+ entry->flags = flags;
+ entry->xattr_value = valp;
+ kref_init(&entry->ref);
+ entry->xattr_name = namep;
+ entry->xattr_size = len;
+ entry->bucket = NULL;
+ INIT_LIST_HEAD(&entry->lru);
+ INIT_LIST_HEAD(&entry->dispose);
+ INIT_HLIST_NODE(&entry->hnode);
+
+ return entry;
+}
+
+static void
+nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry)
+{
+ if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
+ kvfree(entry->xattr_value);
+ kfree(entry);
+}
+
+static void
+nfs4_xattr_free_entry_cb(struct kref *kref)
+{
+ struct nfs4_xattr_entry *entry;
+
+ entry = container_of(kref, struct nfs4_xattr_entry, ref);
+
+ if (WARN_ON(!list_empty(&entry->lru)))
+ return;
+
+ nfs4_xattr_free_entry(entry);
+}
+
+static void
+nfs4_xattr_free_cache_cb(struct kref *kref)
+{
+ struct nfs4_xattr_cache *cache;
+ int i;
+
+ cache = container_of(kref, struct nfs4_xattr_cache, ref);
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist)))
+ return;
+ cache->buckets[i].draining = false;
+ }
+
+ cache->listxattr = NULL;
+
+ kmem_cache_free(nfs4_xattr_cache_cachep, cache);
+
+}
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_alloc_cache(void)
+{
+ struct nfs4_xattr_cache *cache;
+
+ cache = kmem_cache_alloc(nfs4_xattr_cache_cachep,
+ GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ if (cache == NULL)
+ return NULL;
+
+ kref_init(&cache->ref);
+ atomic_long_set(&cache->nent, 0);
+
+ return cache;
+}
+
+/*
+ * Set the listxattr cache, which is a special-cased cache entry.
+ * The special value ERR_PTR(-ESTALE) is used to indicate that
+ * the cache is being drained - this prevents a new listxattr
+ * cache from being added to what is now a stale cache.
+ */
+static int
+nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache,
+ struct nfs4_xattr_entry *new)
+{
+ struct nfs4_xattr_entry *old;
+ int ret = 1;
+
+ spin_lock(&cache->listxattr_lock);
+
+ old = cache->listxattr;
+
+ if (old == ERR_PTR(-ESTALE)) {
+ ret = 0;
+ goto out;
+ }
+
+ cache->listxattr = new;
+ if (new != NULL && new != ERR_PTR(-ESTALE))
+ nfs4_xattr_entry_lru_add(new);
+
+ if (old != NULL) {
+ nfs4_xattr_entry_lru_del(old);
+ kref_put(&old->ref, nfs4_xattr_free_entry_cb);
+ }
+out:
+ spin_unlock(&cache->listxattr_lock);
+
+ return ret;
+}
+
+/*
+ * Unlink a cache from its parent inode, clearing out an invalid
+ * cache. Must be called with i_lock held.
+ */
+static struct nfs4_xattr_cache *
+nfs4_xattr_cache_unlink(struct inode *inode)
+{
+ struct nfs_inode *nfsi;
+ struct nfs4_xattr_cache *oldcache;
+
+ nfsi = NFS_I(inode);
+
+ oldcache = nfsi->xattr_cache;
+ if (oldcache != NULL) {
+ list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru);
+ oldcache->inode = NULL;
+ }
+ nfsi->xattr_cache = NULL;
+ nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR;
+
+ return oldcache;
+
+}
+
+/*
+ * Discard a cache. Called by get_cache() if there was an old,
+ * invalid cache. Can also be called from a shrinker callback.
+ *
+ * The cache is dead, it has already been unlinked from its inode,
+ * and no longer appears on the cache LRU list.
+ *
+ * Mark all buckets as draining, so that no new entries are added. This
+ * could still happen in the unlikely, but possible case that another
+ * thread had grabbed a reference before it was unlinked from the inode,
+ * and is still holding it for an add operation.
+ *
+ * Remove all entries from the LRU lists, so that there is no longer
+ * any way to 'find' this cache. Then, remove the entries from the hash
+ * table.
+ *
+ * At that point, the cache will remain empty and can be freed when the final
+ * reference drops, which is very likely the kref_put at the end of
+ * this function, or the one called immediately afterwards in the
+ * shrinker callback.
+ */
+static void
+nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache)
+{
+ unsigned int i;
+ struct nfs4_xattr_entry *entry;
+ struct nfs4_xattr_bucket *bucket;
+ struct hlist_node *n;
+
+ nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE));
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ bucket = &cache->buckets[i];
+
+ spin_lock(&bucket->lock);
+ bucket->draining = true;
+ hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) {
+ nfs4_xattr_entry_lru_del(entry);
+ hlist_del_init(&entry->hnode);
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ }
+ spin_unlock(&bucket->lock);
+ }
+
+ atomic_long_set(&cache->nent, 0);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Get a referenced copy of the cache structure. Avoid doing allocs
+ * while holding i_lock. Which means that we do some optimistic allocation,
+ * and might have to free the result in rare cases.
+ *
+ * This function only checks the NFS_INO_INVALID_XATTR cache validity bit
+ * and acts accordingly, replacing the cache when needed. For the read case
+ * (!add), this means that the caller must make sure that the cache
+ * is valid before caling this function. getxattr and listxattr call
+ * revalidate_inode to do this. The attribute cache timeout (for the
+ * non-delegated case) is expected to be dealt with in the revalidate
+ * call.
+ */
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_get_cache(struct inode *inode, int add)
+{
+ struct nfs_inode *nfsi;
+ struct nfs4_xattr_cache *cache, *oldcache, *newcache;
+
+ nfsi = NFS_I(inode);
+
+ cache = oldcache = NULL;
+
+ spin_lock(&inode->i_lock);
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_XATTR)
+ oldcache = nfs4_xattr_cache_unlink(inode);
+ else
+ cache = nfsi->xattr_cache;
+
+ if (cache != NULL)
+ kref_get(&cache->ref);
+
+ spin_unlock(&inode->i_lock);
+
+ if (add && cache == NULL) {
+ newcache = NULL;
+
+ cache = nfs4_xattr_alloc_cache();
+ if (cache == NULL)
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) {
+ /*
+ * The cache was invalidated again. Give up,
+ * since what we want to enter is now likely
+ * outdated anyway.
+ */
+ spin_unlock(&inode->i_lock);
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ cache = NULL;
+ goto out;
+ }
+
+ /*
+ * Check if someone beat us to it.
+ */
+ if (nfsi->xattr_cache != NULL) {
+ newcache = nfsi->xattr_cache;
+ kref_get(&newcache->ref);
+ } else {
+ kref_get(&cache->ref);
+ nfsi->xattr_cache = cache;
+ cache->inode = inode;
+ list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * If there was a race, throw away the cache we just
+ * allocated, and use the new one allocated by someone
+ * else.
+ */
+ if (newcache != NULL) {
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ cache = newcache;
+ }
+ }
+
+out:
+ /*
+ * Discard the now orphaned old cache.
+ */
+ if (oldcache != NULL)
+ nfs4_xattr_discard_cache(oldcache);
+
+ return cache;
+}
+
+static inline struct nfs4_xattr_bucket *
+nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name)
+{
+ return &cache->buckets[jhash(name, strlen(name), 0) &
+ (ARRAY_SIZE(cache->buckets) - 1)];
+}
+
+static struct nfs4_xattr_entry *
+nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name)
+{
+ struct nfs4_xattr_entry *entry;
+
+ entry = NULL;
+
+ hlist_for_each_entry(entry, &bucket->hlist, hnode) {
+ if (!strcmp(entry->xattr_name, name))
+ break;
+ }
+
+ return entry;
+}
+
+static int
+nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache,
+ struct nfs4_xattr_entry *entry)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *oldentry = NULL;
+ int ret = 1;
+
+ bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name);
+ entry->bucket = bucket;
+
+ spin_lock(&bucket->lock);
+
+ if (bucket->draining) {
+ ret = 0;
+ goto out;
+ }
+
+ oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name);
+ if (oldentry != NULL) {
+ hlist_del_init(&oldentry->hnode);
+ nfs4_xattr_entry_lru_del(oldentry);
+ } else {
+ atomic_long_inc(&cache->nent);
+ }
+
+ hlist_add_head(&entry->hnode, &bucket->hlist);
+ nfs4_xattr_entry_lru_add(entry);
+
+out:
+ spin_unlock(&bucket->lock);
+
+ if (oldentry != NULL)
+ kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb);
+
+ return ret;
+}
+
+static void
+nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *entry;
+
+ bucket = nfs4_xattr_hash_bucket(cache, name);
+
+ spin_lock(&bucket->lock);
+
+ entry = nfs4_xattr_get_entry(bucket, name);
+ if (entry != NULL) {
+ hlist_del_init(&entry->hnode);
+ nfs4_xattr_entry_lru_del(entry);
+ atomic_long_dec(&cache->nent);
+ }
+
+ spin_unlock(&bucket->lock);
+
+ if (entry != NULL)
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+}
+
+static struct nfs4_xattr_entry *
+nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *entry;
+
+ bucket = nfs4_xattr_hash_bucket(cache, name);
+
+ spin_lock(&bucket->lock);
+
+ entry = nfs4_xattr_get_entry(bucket, name);
+ if (entry != NULL)
+ kref_get(&entry->ref);
+
+ spin_unlock(&bucket->lock);
+
+ return entry;
+}
+
+/*
+ * Entry point to retrieve an entry from the cache.
+ */
+ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf,
+ ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+ ssize_t ret;
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return -ENOENT;
+
+ ret = 0;
+ entry = nfs4_xattr_hash_find(cache, name);
+
+ if (entry != NULL) {
+ dprintk("%s: cache hit '%s', len %lu\n", __func__,
+ entry->xattr_name, (unsigned long)entry->xattr_size);
+ if (buflen == 0) {
+ /* Length probe only */
+ ret = entry->xattr_size;
+ } else if (buflen < entry->xattr_size)
+ ret = -ERANGE;
+ else {
+ memcpy(buf, entry->xattr_value, entry->xattr_size);
+ ret = entry->xattr_size;
+ }
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ } else {
+ dprintk("%s: cache miss '%s'\n", __func__, name);
+ ret = -ENOENT;
+ }
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+ return ret;
+}
+
+/*
+ * Retrieve a cached list of xattrs from the cache.
+ */
+ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+ ssize_t ret;
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return -ENOENT;
+
+ spin_lock(&cache->listxattr_lock);
+
+ entry = cache->listxattr;
+
+ if (entry != NULL && entry != ERR_PTR(-ESTALE)) {
+ if (buflen == 0) {
+ /* Length probe only */
+ ret = entry->xattr_size;
+ } else if (entry->xattr_size > buflen)
+ ret = -ERANGE;
+ else {
+ memcpy(buf, entry->xattr_value, entry->xattr_size);
+ ret = entry->xattr_size;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+ spin_unlock(&cache->listxattr_lock);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+ return ret;
+}
+
+/*
+ * Add an xattr to the cache.
+ *
+ * This also invalidates the xattr list cache.
+ */
+void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages, ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+
+ dprintk("%s: add '%s' len %lu\n", __func__,
+ name, (unsigned long)buflen);
+
+ cache = nfs4_xattr_get_cache(inode, 1);
+ if (cache == NULL)
+ return;
+
+ entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen);
+ if (entry == NULL)
+ goto out;
+
+ (void)nfs4_xattr_set_listcache(cache, NULL);
+
+ if (!nfs4_xattr_hash_add(cache, entry))
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out:
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+
+/*
+ * Remove an xattr from the cache.
+ *
+ * This also invalidates the xattr list cache.
+ */
+void nfs4_xattr_cache_remove(struct inode *inode, const char *name)
+{
+ struct nfs4_xattr_cache *cache;
+
+ dprintk("%s: remove '%s'\n", __func__, name);
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return;
+
+ (void)nfs4_xattr_set_listcache(cache, NULL);
+ nfs4_xattr_hash_remove(cache, name);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Cache listxattr output, replacing any possible old one.
+ */
+void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+
+ cache = nfs4_xattr_get_cache(inode, 1);
+ if (cache == NULL)
+ return;
+
+ entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen);
+ if (entry == NULL)
+ goto out;
+
+ /*
+ * This is just there to be able to get to bucket->cache,
+ * which is obviously the same for all buckets, so just
+ * use bucket 0.
+ */
+ entry->bucket = &cache->buckets[0];
+
+ if (!nfs4_xattr_set_listcache(cache, entry))
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out:
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Zap the entire cache. Called when an inode is evicted.
+ */
+void nfs4_xattr_cache_zap(struct inode *inode)
+{
+ struct nfs4_xattr_cache *oldcache;
+
+ spin_lock(&inode->i_lock);
+ oldcache = nfs4_xattr_cache_unlink(inode);
+ spin_unlock(&inode->i_lock);
+
+ if (oldcache)
+ nfs4_xattr_discard_cache(oldcache);
+}
+
+/*
+ * The entry LRU is shrunk more aggressively than the cache LRU,
+ * by settings @seeks to 1.
+ *
+ * Cache structures are freed only when they've become empty, after
+ * pruning all but one entry.
+ */
+
+static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+static struct shrinker nfs4_xattr_cache_shrinker = {
+ .count_objects = nfs4_xattr_cache_count,
+ .scan_objects = nfs4_xattr_cache_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static struct shrinker nfs4_xattr_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = DEFAULT_SEEKS,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static struct shrinker nfs4_xattr_large_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = 1,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static enum lru_status
+cache_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct inode *inode;
+ struct nfs4_xattr_cache *cache = container_of(item,
+ struct nfs4_xattr_cache, lru);
+
+ if (atomic_long_read(&cache->nent) > 1)
+ return LRU_SKIP;
+
+ /*
+ * If a cache structure is on the LRU list, we know that
+ * its inode is valid. Try to lock it to break the link.
+ * Since we're inverting the lock order here, only try.
+ */
+ inode = cache->inode;
+
+ if (!spin_trylock(&inode->i_lock))
+ return LRU_SKIP;
+
+ kref_get(&cache->ref);
+
+ cache->inode = NULL;
+ NFS_I(inode)->xattr_cache = NULL;
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR;
+ list_lru_isolate(lru, &cache->lru);
+
+ spin_unlock(&inode->i_lock);
+
+ list_add_tail(&cache->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ struct nfs4_xattr_cache *cache;
+
+ freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc,
+ cache_lru_isolate, &dispose);
+ while (!list_empty(&dispose)) {
+ cache = list_first_entry(&dispose, struct nfs4_xattr_cache,
+ dispose);
+ list_del_init(&cache->dispose);
+ nfs4_xattr_discard_cache(cache);
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ }
+
+ return freed;
+}
+
+
+static unsigned long
+nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+
+ count = list_lru_count(&nfs4_xattr_cache_lru);
+ return vfs_pressure_ratio(count);
+}
+
+static enum lru_status
+entry_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry = container_of(item,
+ struct nfs4_xattr_entry, lru);
+
+ bucket = entry->bucket;
+ cache = bucket->cache;
+
+ /*
+ * Unhook the entry from its parent (either a cache bucket
+ * or a cache structure if it's a listxattr buf), so that
+ * it's no longer found. Then add it to the isolate list,
+ * to be freed later.
+ *
+ * In both cases, we're reverting lock order, so use
+ * trylock and skip the entry if we can't get the lock.
+ */
+ if (entry->xattr_name != NULL) {
+ /* Regular cache entry */
+ if (!spin_trylock(&bucket->lock))
+ return LRU_SKIP;
+
+ kref_get(&entry->ref);
+
+ hlist_del_init(&entry->hnode);
+ atomic_long_dec(&cache->nent);
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&bucket->lock);
+ } else {
+ /* Listxattr cache entry */
+ if (!spin_trylock(&cache->listxattr_lock))
+ return LRU_SKIP;
+
+ kref_get(&entry->ref);
+
+ cache->listxattr = NULL;
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&cache->listxattr_lock);
+ }
+
+ list_add_tail(&entry->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ struct nfs4_xattr_entry *entry;
+ struct list_lru *lru;
+
+ lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
+
+ while (!list_empty(&dispose)) {
+ entry = list_first_entry(&dispose, struct nfs4_xattr_entry,
+ dispose);
+ list_del_init(&entry->dispose);
+
+ /*
+ * Drop two references: the one that we just grabbed
+ * in entry_lru_isolate, and the one that was set
+ * when the entry was first allocated.
+ */
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ }
+
+ return freed;
+}
+
+static unsigned long
+nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+ struct list_lru *lru;
+
+ lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ count = list_lru_count(lru);
+ return vfs_pressure_ratio(count);
+}
+
+
+static void nfs4_xattr_cache_init_once(void *p)
+{
+ struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p;
+
+ spin_lock_init(&cache->listxattr_lock);
+ atomic_long_set(&cache->nent, 0);
+ nfs4_xattr_hash_init(cache);
+ cache->listxattr = NULL;
+ INIT_LIST_HEAD(&cache->lru);
+ INIT_LIST_HEAD(&cache->dispose);
+}
+
+int __init nfs4_xattr_cache_init(void)
+{
+ int ret = 0;
+
+ nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
+ sizeof(struct nfs4_xattr_cache), 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ nfs4_xattr_cache_init_once);
+ if (nfs4_xattr_cache_cachep == NULL)
+ return -ENOMEM;
+
+ ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru,
+ &nfs4_xattr_large_entry_shrinker);
+ if (ret)
+ goto out4;
+
+ ret = list_lru_init_memcg(&nfs4_xattr_entry_lru,
+ &nfs4_xattr_entry_shrinker);
+ if (ret)
+ goto out3;
+
+ ret = list_lru_init_memcg(&nfs4_xattr_cache_lru,
+ &nfs4_xattr_cache_shrinker);
+ if (ret)
+ goto out2;
+
+ ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+ if (ret)
+ goto out1;
+
+ ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+ if (ret)
+ goto out;
+
+ ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+ if (!ret)
+ return 0;
+
+ unregister_shrinker(&nfs4_xattr_entry_shrinker);
+out:
+ unregister_shrinker(&nfs4_xattr_cache_shrinker);
+out1:
+ list_lru_destroy(&nfs4_xattr_cache_lru);
+out2:
+ list_lru_destroy(&nfs4_xattr_entry_lru);
+out3:
+ list_lru_destroy(&nfs4_xattr_large_entry_lru);
+out4:
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+
+ return ret;
+}
+
+void nfs4_xattr_cache_exit(void)
+{
+ unregister_shrinker(&nfs4_xattr_entry_shrinker);
+ unregister_shrinker(&nfs4_xattr_cache_shrinker);
+ list_lru_destroy(&nfs4_xattr_entry_lru);
+ list_lru_destroy(&nfs4_xattr_cache_lru);
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index c03f3246d6c5..cc50085e151c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -169,6 +169,78 @@
decode_clone_maxsz + \
decode_getattr_maxsz)
+/* Not limited by NFS itself, limited by the generic xattr code */
+#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX)
+
+#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + 1)
+#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_xattr_name_maxsz + 1)
+#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1)
+#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1)
+#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz)
+
+#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getxattr_maxsz)
+#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getxattr_maxsz)
+#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setxattr_maxsz)
+#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setxattr_maxsz)
+#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_listxattrs_maxsz)
+#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_listxattrs_maxsz)
+#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_removexattr_maxsz)
+#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_removexattr_maxsz)
+
+/*
+ * These values specify the maximum amount of data that is not
+ * associated with the extended attribute name or extended
+ * attribute list in the SETXATTR, GETXATTR and LISTXATTR
+ * respectively.
+ */
+const u32 nfs42_maxsetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+ encode_sequence_maxsz +
+ encode_putfh_maxsz + 1 +
+ nfs4_xattr_name_maxsz)
+ * XDR_UNIT);
+
+const u32 nfs42_maxgetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 1) * XDR_UNIT);
+
+const u32 nfs42_maxlistxattrs_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 3) * XDR_UNIT);
+
static void encode_fallocate(struct xdr_stream *xdr,
const struct nfs42_falloc_args *args)
{
@@ -333,6 +405,210 @@ static void encode_layouterror(struct xdr_stream *xdr,
encode_device_error(xdr, &args->errors[0]);
}
+static void encode_setxattr(struct xdr_stream *xdr,
+ const struct nfs42_setxattrargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ BUILD_BUG_ON(XATTR_CREATE != SETXATTR4_CREATE);
+ BUILD_BUG_ON(XATTR_REPLACE != SETXATTR4_REPLACE);
+
+ encode_op_hdr(xdr, OP_SETXATTR, decode_setxattr_maxsz, hdr);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_flags);
+ encode_string(xdr, strlen(arg->xattr_name), arg->xattr_name);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_len);
+ if (arg->xattr_len)
+ xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len);
+}
+
+static int decode_setxattr(struct xdr_stream *xdr,
+ struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_SETXATTR);
+ if (status)
+ goto out;
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+
+static void encode_getxattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_GETXATTR, decode_getxattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+static int decode_getxattr(struct xdr_stream *xdr,
+ struct nfs42_getxattrres *res,
+ struct rpc_rqst *req)
+{
+ int status;
+ __be32 *p;
+ u32 len, rdlen;
+
+ status = decode_op_hdr(xdr, OP_GETXATTR);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ len = be32_to_cpup(p);
+ if (len > req->rq_rcv_buf.page_len)
+ return -ERANGE;
+
+ res->xattr_len = len;
+
+ if (len > 0) {
+ rdlen = xdr_read_pages(xdr, len);
+ if (rdlen < len)
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void encode_removexattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_REMOVEXATTR, decode_removexattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+
+static int decode_removexattr(struct xdr_stream *xdr,
+ struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_REMOVEXATTR);
+ if (status)
+ goto out;
+
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+static void encode_listxattrs(struct xdr_stream *xdr,
+ const struct nfs42_listxattrsargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr);
+
+ p = reserve_space(xdr, 12);
+ if (unlikely(!p))
+ return;
+
+ p = xdr_encode_hyper(p, arg->cookie);
+ /*
+ * RFC 8276 says to specify the full max length of the LISTXATTRS
+ * XDR reply. Count is set to the XDR length of the names array
+ * plus the EOF marker. So, add the cookie and the names count.
+ */
+ *p = cpu_to_be32(arg->count + 8 + 4);
+}
+
+static int decode_listxattrs(struct xdr_stream *xdr,
+ struct nfs42_listxattrsres *res)
+{
+ int status;
+ __be32 *p;
+ u32 count, len, ulen;
+ size_t left, copied;
+ char *buf;
+
+ status = decode_op_hdr(xdr, OP_LISTXATTRS);
+ if (status) {
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL
+ * should be translated to ERANGE.
+ */
+ if (status == -ETOOSMALL)
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ xdr_decode_hyper(p, &res->cookie);
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ left = res->xattr_len;
+ buf = res->xattr_buf;
+
+ count = be32_to_cpup(p);
+ copied = 0;
+
+ /*
+ * We have asked for enough room to encode the maximum number
+ * of possible attribute names, so everything should fit.
+ *
+ * But, don't rely on that assumption. Just decode entries
+ * until they don't fit anymore, just in case the server did
+ * something odd.
+ */
+ while (count--) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ len = be32_to_cpup(p);
+ if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+
+ ulen = len + XATTR_USER_PREFIX_LEN + 1;
+ if (buf) {
+ if (ulen > left) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ memcpy(buf + XATTR_USER_PREFIX_LEN, p, len);
+
+ buf[ulen - 1] = 0;
+ buf += ulen;
+ left -= ulen;
+ }
+ copied += ulen;
+ }
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->eof = be32_to_cpup(p);
+ res->copied = copied;
+
+out:
+ if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX)
+ status = -E2BIG;
+
+ return status;
+}
+
/*
* Encode ALLOCATE request
*/
@@ -988,4 +1264,166 @@ out:
return status;
}
+#ifdef CONFIG_NFS_V4_2
+static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_setxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_setxattr(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_setxattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+
+ status = decode_setxattr(xdr, &res->cinfo);
+out:
+ return status;
+}
+
+static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_getxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ size_t plen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getxattr(xdr, args->xattr_name, &hdr);
+
+ plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen,
+ hdr.replen);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_getxattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getxattr(xdr, res, rqstp);
+out:
+ return status;
+}
+
+static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_listxattrsargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_listxattrs(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count,
+ hdr.replen);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_listxattrsres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE);
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_listxattrs(xdr, res);
+out:
+ return status;
+}
+
+static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_removexattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_removexattr(xdr, args->xattr_name, &hdr);
+ encode_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_removexattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+
+ status = decode_removexattr(xdr, &res->cinfo);
+out:
+ return status;
+}
+#endif
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 2b7f6dcd2eb8..0c9505dc852c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -117,7 +117,7 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
- seqcount_t so_reclaim_seqcount;
+ seqcount_spinlock_t so_reclaim_seqcount;
struct mutex so_delegreturn_mutex;
};
@@ -324,6 +324,13 @@ extern int update_open_stateid(struct nfs4_state *state,
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
+extern void nfs4_update_changeattr(struct inode *dir,
+ struct nfs4_change_info *cinfo,
+ unsigned long timestamp,
+ unsigned long cache_validity);
+extern int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
+ struct page **pages);
+
#if defined(CONFIG_NFS_V4_1)
extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *);
@@ -557,6 +564,12 @@ static inline void nfs4_unregister_sysctl(void)
/* nfs4xdr.c */
extern const struct rpc_procinfo nfs4_procedures[];
+#ifdef CONFIG_NFS_V4_2
+extern const u32 nfs42_maxsetxattr_overhead;
+extern const u32 nfs42_maxgetxattr_overhead;
+extern const u32 nfs42_maxlistxattrs_overhead;
+#endif
+
struct nfs4_mount_data;
/* callback_xdr.c */
@@ -613,12 +626,34 @@ static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state *
nfs4_stateid_match_other(&state->open_stateid, stateid);
}
+/* nfs42xattr.c */
+#ifdef CONFIG_NFS_V4_2
+extern int __init nfs4_xattr_cache_init(void);
+extern void nfs4_xattr_cache_exit(void);
+extern void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name);
+extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name,
+ char *buf, ssize_t buflen);
+extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen);
+extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_zap(struct inode *inode);
#else
+static inline void nfs4_xattr_cache_zap(struct inode *inode)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+#else /* CONFIG_NFS_V4 */
#define nfs4_close_state(a, b) do { } while (0)
#define nfs4_close_sync(a, b) do { } while (0)
#define nfs4_state_protect(a, b, c, d) do { } while (0)
#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
+
#endif /* CONFIG_NFS_V4 */
#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0bd77cc1f639..daacc78a3d48 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -880,7 +880,7 @@ static int nfs4_set_client(struct nfs_server *server,
if (minorversion == 0)
__set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
- else if (proto == XPRT_TRANSPORT_TCP)
+ if (proto == XPRT_TRANSPORT_TCP)
cl_init.nconnect = nconnect;
if (server->flags & NFS_MOUNT_NORESVPORT)
@@ -992,6 +992,36 @@ static void nfs4_session_limit_rwsize(struct nfs_server *server)
#endif /* CONFIG_NFS_V4_1 */
}
+/*
+ * Limit xattr sizes using the channel attributes.
+ */
+static void nfs4_session_limit_xasize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_2
+ struct nfs4_session *sess;
+ u32 server_gxa_sz;
+ u32 server_sxa_sz;
+ u32 server_lxa_sz;
+
+ if (!nfs4_has_session(server->nfs_client))
+ return;
+
+ sess = server->nfs_client->cl_session;
+
+ server_gxa_sz = sess->fc_attrs.max_resp_sz - nfs42_maxgetxattr_overhead;
+ server_sxa_sz = sess->fc_attrs.max_rqst_sz - nfs42_maxsetxattr_overhead;
+ server_lxa_sz = sess->fc_attrs.max_resp_sz -
+ nfs42_maxlistxattrs_overhead;
+
+ if (server->gxasize > server_gxa_sz)
+ server->gxasize = server_gxa_sz;
+ if (server->sxasize > server_sxa_sz)
+ server->sxasize = server_sxa_sz;
+ if (server->lxasize > server_lxa_sz)
+ server->lxasize = server_lxa_sz;
+#endif
+}
+
static int nfs4_server_common_setup(struct nfs_server *server,
struct nfs_fh *mntfh, bool auth_probe)
{
@@ -1039,6 +1069,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
goto out;
nfs4_session_limit_rwsize(server);
+ nfs4_session_limit_xasize(server);
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 8e5d6223ddd3..fdfc77486ace 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -110,6 +110,7 @@ static int
nfs4_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
+ errseq_t since;
dprintk("NFS: flush(%pD2)\n", file);
@@ -125,7 +126,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
return filemap_fdatawrite(file->f_mapping);
/* Flush writes to the server and return any errors */
- return nfs_wb_all(inode);
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
}
#ifdef CONFIG_NFS_V4_2
@@ -208,7 +211,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
ret = nfs42_proc_llseek(filep, offset, whence);
if (ret != -ENOTSUPP)
return ret;
- /* Fall through */
+ fallthrough;
default:
return nfs_file_llseek(filep, offset, whence);
}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 1e7296395d71..62e6eea5c516 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -520,7 +520,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
switch (token) {
case Opt_find_uid:
im->im_type = IDMAP_TYPE_USER;
- /* Fall through */
+ fallthrough;
case Opt_find_gid:
im->im_conv = IDMAP_CONV_NAMETOID;
ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
@@ -528,7 +528,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
case Opt_find_user:
im->im_type = IDMAP_TYPE_USER;
- /* Fall through */
+ fallthrough;
case Opt_find_group:
im->im_conv = IDMAP_CONV_IDTONAME;
ret = match_int(&substr, &im->im_id);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8963062da57e..6e95c85fe395 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -66,6 +66,7 @@
#include "nfs4idmap.h"
#include "nfs4session.h"
#include "fscache.h"
+#include "nfs42.h"
#include "nfs4trace.h"
@@ -256,6 +257,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
| FATTR4_WORD2_CLONE_BLKSIZE
+ | FATTR4_WORD2_XATTR_SUPPORT
};
const u32 nfs4_fs_locations_bitmap[3] = {
@@ -481,7 +483,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
stateid);
goto wait_on_recovery;
}
- /* Fall through */
+ fallthrough;
case -NFS4ERR_OPENMODE:
if (inode) {
int err;
@@ -532,10 +534,10 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
ret = -EBUSY;
break;
}
- /* Fall through */
+ fallthrough;
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
- /* Fall through */
+ fallthrough;
case -NFS4ERR_GRACE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
@@ -1173,37 +1175,49 @@ nfs4_dec_nlink_locked(struct inode *inode)
}
static void
-update_changeattr_locked(struct inode *dir, struct nfs4_change_info *cinfo,
+nfs4_update_changeattr_locked(struct inode *inode,
+ struct nfs4_change_info *cinfo,
unsigned long timestamp, unsigned long cache_validity)
{
- struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_inode *nfsi = NFS_I(inode);
nfsi->cache_validity |= NFS_INO_INVALID_CTIME
| NFS_INO_INVALID_MTIME
- | NFS_INO_INVALID_DATA
| cache_validity;
- if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) {
+
+ if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) {
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
nfsi->attrtimeo_timestamp = jiffies;
} else {
- nfs_force_lookup_revalidate(dir);
- if (cinfo->before != inode_peek_iversion_raw(dir))
+ if (S_ISDIR(inode->i_mode)) {
+ nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_force_lookup_revalidate(inode);
+ } else {
+ if (!NFS_PROTO(inode)->have_delegation(inode,
+ FMODE_READ))
+ nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
+ }
+
+ if (cinfo->before != inode_peek_iversion_raw(inode))
nfsi->cache_validity |= NFS_INO_INVALID_ACCESS |
- NFS_INO_INVALID_ACL;
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR;
}
- inode_set_iversion_raw(dir, cinfo->after);
+ inode_set_iversion_raw(inode, cinfo->after);
nfsi->read_cache_jiffies = timestamp;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
- nfs_fscache_invalidate(dir);
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
}
-static void
-update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
+void
+nfs4_update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
unsigned long timestamp, unsigned long cache_validity)
{
spin_lock(&dir->i_lock);
- update_changeattr_locked(dir, cinfo, timestamp, cache_validity);
+ nfs4_update_changeattr_locked(dir, cinfo, timestamp, cache_validity);
spin_unlock(&dir->i_lock);
}
@@ -1356,6 +1370,12 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
NFS4_ACCESS_MODIFY |
NFS4_ACCESS_EXTEND |
NFS4_ACCESS_EXECUTE;
+#ifdef CONFIG_NFS_V4_2
+ if (server->caps & NFS_CAP_XATTR)
+ p->o_arg.access |= NFS4_ACCESS_XAREAD |
+ NFS4_ACCESS_XAWRITE |
+ NFS4_ACCESS_XALIST;
+#endif
}
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
@@ -1485,7 +1505,7 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
break;
- /* Fall through */
+ fallthrough;
default:
return 0;
}
@@ -2419,7 +2439,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
- /* Fall through */
+ fallthrough;
case NFS4_OPEN_CLAIM_FH:
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
}
@@ -2653,8 +2673,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
data->file_created = true;
if (data->file_created ||
inode_peek_iversion_raw(dir) != o_res->cinfo.after)
- update_changeattr(dir, &o_res->cinfo,
- o_res->f_attr->time_start, 0);
+ nfs4_update_changeattr(dir, &o_res->cinfo,
+ o_res->f_attr->time_start,
+ NFS_INO_INVALID_DATA);
}
if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
server->caps &= ~NFS_CAP_POSIX_LOCK;
@@ -3272,8 +3293,10 @@ static int _nfs4_do_setattr(struct inode *inode,
/* Servers should only apply open mode checks for file size changes */
truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
- if (!truncate)
+ if (!truncate) {
+ nfs4_inode_make_writeable(inode);
goto zero_stateid;
+ }
if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
@@ -3524,11 +3547,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
nfs4_free_revoked_stateid(server,
&calldata->arg.stateid,
task->tk_msg.rpc_cred);
- /* Fallthrough */
+ fallthrough;
case -NFS4ERR_BAD_STATEID:
if (calldata->arg.fmode == 0)
break;
- /* Fallthrough */
+ fallthrough;
default:
task->tk_status = nfs4_async_handle_exception(task,
server, task->tk_status, &exception);
@@ -3756,7 +3779,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
-#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -4540,7 +4563,8 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype)
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- update_changeattr_locked(dir, &res.cinfo, timestamp, 0);
+ nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
+ NFS_INO_INVALID_DATA);
/* Removing a directory decrements nlink in the parent */
if (ftype == NF4DIR && dir->i_nlink > 2)
nfs4_dec_nlink_locked(dir);
@@ -4624,8 +4648,9 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
&data->timeout) == -EAGAIN)
return 0;
if (task->tk_status == 0)
- update_changeattr(dir, &res->cinfo,
- res->dir_attr->time_start, 0);
+ nfs4_update_changeattr(dir, &res->cinfo,
+ res->dir_attr->time_start,
+ NFS_INO_INVALID_DATA);
return 1;
}
@@ -4669,16 +4694,18 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
if (task->tk_status == 0) {
if (new_dir != old_dir) {
/* Note: If we moved a directory, nlink will change */
- update_changeattr(old_dir, &res->old_cinfo,
+ nfs4_update_changeattr(old_dir, &res->old_cinfo,
res->old_fattr->time_start,
- NFS_INO_INVALID_OTHER);
- update_changeattr(new_dir, &res->new_cinfo,
+ NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_DATA);
+ nfs4_update_changeattr(new_dir, &res->new_cinfo,
res->new_fattr->time_start,
- NFS_INO_INVALID_OTHER);
+ NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_DATA);
} else
- update_changeattr(old_dir, &res->old_cinfo,
+ nfs4_update_changeattr(old_dir, &res->old_cinfo,
res->old_fattr->time_start,
- 0);
+ NFS_INO_INVALID_DATA);
}
return 1;
}
@@ -4719,7 +4746,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
- update_changeattr(dir, &res.cinfo, res.fattr->time_start, 0);
+ nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
status = nfs_post_op_update_inode(inode, res.fattr);
if (!status)
nfs_setsecurity(inode, res.fattr, res.label);
@@ -4797,8 +4825,9 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- update_changeattr_locked(dir, &data->res.dir_cinfo,
- data->res.fattr->time_start, 0);
+ nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
/* Creating a directory bumps nlink in the parent */
if (data->arg.ftype == NF4DIR)
nfs4_inc_nlink_locked(dir);
@@ -5531,7 +5560,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
*/
#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
-static int buf_to_pages_noslab(const void *buf, size_t buflen,
+int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
struct page **pages)
{
struct page *newpage, **spages;
@@ -5773,7 +5802,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
- i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
+ i = nfs4_buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
nfs4_inode_make_writeable(inode);
@@ -5845,8 +5874,6 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
return ret;
if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
return -ENOENT;
- if (buflen < label.len)
- return -ERANGE;
return 0;
}
@@ -6269,7 +6296,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
nfs4_free_revoked_stateid(data->res.server,
data->args.stateid,
task->tk_msg.rpc_cred);
- /* Fallthrough */
+ fallthrough;
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -ETIMEDOUT:
@@ -6289,7 +6316,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
data->res.fattr = NULL;
goto out_restart;
}
- /* Fallthrough */
+ fallthrough;
default:
task->tk_status = nfs4_async_handle_exception(task,
data->res.server, task->tk_status,
@@ -6597,13 +6624,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
if (nfs4_update_lock_stateid(calldata->lsp,
&calldata->res.stateid))
break;
- /* Fall through */
+ fallthrough;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_EXPIRED:
nfs4_free_revoked_stateid(calldata->server,
&calldata->arg.stateid,
task->tk_msg.rpc_cred);
- /* Fall through */
+ fallthrough;
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
@@ -7273,7 +7300,12 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
err = nfs4_set_lock_state(state, fl);
if (err != 0)
return err;
- err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+ do {
+ err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+ if (err != -NFS4ERR_DELAY)
+ break;
+ ssleep(1);
+ } while (err == -NFS4ERR_DELAY);
return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
}
@@ -7430,6 +7462,133 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
#endif
+#ifdef CONFIG_NFS_V4_2
+static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ struct nfs_access_entry cache;
+ int ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ /*
+ * There is no mapping from the MAY_* flags to the NFS_ACCESS_XA*
+ * flags right now. Handling of xattr operations use the normal
+ * file read/write permissions.
+ *
+ * Just in case the server has other ideas (which RFC 8276 allows),
+ * do a cached access check for the XA* flags to possibly avoid
+ * doing an RPC and getting EACCES back.
+ */
+ if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
+ if (!(cache.mask & NFS_ACCESS_XAWRITE))
+ return -EACCES;
+ }
+
+ if (buf == NULL) {
+ ret = nfs42_proc_removexattr(inode, key);
+ if (!ret)
+ nfs4_xattr_cache_remove(inode, key);
+ } else {
+ ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags);
+ if (!ret)
+ nfs4_xattr_cache_add(inode, key, buf, NULL, buflen);
+ }
+
+ return ret;
+}
+
+static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ struct nfs_access_entry cache;
+ ssize_t ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
+ if (!(cache.mask & NFS_ACCESS_XAREAD))
+ return -EACCES;
+ }
+
+ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret)
+ return ret;
+
+ ret = nfs4_xattr_cache_get(inode, key, buf, buflen);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ ret = nfs42_proc_getxattr(inode, key, buf, buflen);
+
+ return ret;
+}
+
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ u64 cookie;
+ bool eof;
+ ssize_t ret, size;
+ char *buf;
+ size_t buflen;
+ struct nfs_access_entry cache;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return 0;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) {
+ if (!(cache.mask & NFS_ACCESS_XALIST))
+ return 0;
+ }
+
+ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret)
+ return ret;
+
+ ret = nfs4_xattr_cache_list(inode, list, list_len);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ cookie = 0;
+ eof = false;
+ buflen = list_len ? list_len : XATTR_LIST_MAX;
+ buf = list_len ? list : NULL;
+ size = 0;
+
+ while (!eof) {
+ ret = nfs42_proc_listxattrs(inode, buf, buflen,
+ &cookie, &eof);
+ if (ret < 0)
+ return ret;
+
+ if (list_len) {
+ buf += ret;
+ buflen -= ret;
+ }
+ size += ret;
+ }
+
+ if (list_len)
+ nfs4_xattr_cache_set_list(inode, list, size);
+
+ return size;
+}
+
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
+
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
*/
@@ -8513,7 +8672,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
rpc_delay(task, NFS4_POLL_RETRY_MIN);
task->tk_status = 0;
- /* fall through */
+ fallthrough;
case -NFS4ERR_RETRY_UNCACHED_REP:
rpc_restart_call_prepare(task);
return;
@@ -8961,13 +9120,13 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
switch(task->tk_status) {
case 0:
wake_up_all(&clp->cl_lock_waitq);
- /* Fallthrough */
+ fallthrough;
case -NFS4ERR_COMPLETE_ALREADY:
case -NFS4ERR_WRONG_CRED: /* What to do here? */
break;
case -NFS4ERR_DELAY:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
- /* fall through */
+ fallthrough;
case -NFS4ERR_RETRY_UNCACHED_REP:
return -EAGAIN;
case -NFS4ERR_BADSESSION:
@@ -9282,10 +9441,10 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
&lrp->args.range,
lrp->args.inode))
goto out_restart;
- /* Fallthrough */
+ fallthrough;
default:
task->tk_status = 0;
- /* Fallthrough */
+ fallthrough;
case 0:
break;
case -NFS4ERR_DELAY:
@@ -10035,7 +10194,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
- ssize_t error, error2;
+ ssize_t error, error2, error3;
error = generic_listxattr(dentry, list, size);
if (error < 0)
@@ -10048,7 +10207,17 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
if (error2 < 0)
return error2;
- return error + error2;
+
+ if (list) {
+ list += error2;
+ size -= error2;
+ }
+
+ error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+ if (error3 < 0)
+ return error3;
+
+ return error + error2 + error3;
}
static const struct inode_operations nfs4_dir_inode_operations = {
@@ -10136,11 +10305,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
.set = nfs4_xattr_set_nfs4_acl,
};
+#ifdef CONFIG_NFS_V4_2
+static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = nfs4_xattr_get_nfs4_user,
+ .set = nfs4_xattr_set_nfs4_user,
+};
+#endif
+
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
&nfs4_xattr_nfs4_label_handler,
#endif
+#ifdef CONFIG_NFS_V4_2
+ &nfs4_xattr_nfs4_user_handler,
+#endif
NULL
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a8dc25ce48bb..4bf10792cb5b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -509,7 +509,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
- seqcount_init(&sp->so_reclaim_seqcount);
+ seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock);
mutex_init(&sp->so_delegreturn_mutex);
return sp;
}
@@ -1530,7 +1530,7 @@ restart:
default:
pr_err("NFS: %s: unhandled error %d\n",
__func__, status);
- /* Fall through */
+ fallthrough;
case -ENOMEM:
case -NFS4ERR_DENIED:
case -NFS4ERR_RECLAIM_BAD:
@@ -1667,7 +1667,7 @@ restart:
break;
}
printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status);
- /* Fall through */
+ fallthrough;
case -ENOENT:
case -ENOMEM:
case -EACCES:
@@ -1683,7 +1683,7 @@ restart:
set_bit(ops->state_flag_bit, &state->flags);
break;
}
- /* Fall through */
+ fallthrough;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
@@ -1695,7 +1695,7 @@ restart:
case -NFS4ERR_EXPIRED:
case -NFS4ERR_NO_GRACE:
nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
- /* Fall through */
+ fallthrough;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_BADSESSION:
case -NFS4ERR_BADSLOT:
@@ -2273,11 +2273,11 @@ again:
case -ETIMEDOUT:
if (clnt->cl_softrtry)
break;
- /* Fall through */
+ fallthrough;
case -NFS4ERR_DELAY:
case -EAGAIN:
ssleep(1);
- /* Fall through */
+ fallthrough;
case -NFS4ERR_STALE_CLIENTID:
dprintk("NFS: %s after status %d, retrying\n",
__func__, status);
@@ -2289,7 +2289,7 @@ again:
}
if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)
break;
- /* Fall through */
+ fallthrough;
case -NFS4ERR_CLID_INUSE:
case -NFS4ERR_WRONGSEC:
/* No point in retrying if we already used RPC_AUTH_UNIX */
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 1475f932d7da..0c1ab846b83d 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -69,6 +69,7 @@ static void nfs4_evict_inode(struct inode *inode)
pnfs_destroy_layout(NFS_I(inode));
/* First call standard NFS clear_inode() code */
nfs_clear_inode(inode);
+ nfs4_xattr_cache_zap(inode);
}
struct nfs_referral_count {
@@ -268,6 +269,12 @@ static int __init init_nfs_v4(void)
if (err)
goto out1;
+#ifdef CONFIG_NFS_V4_2
+ err = nfs4_xattr_cache_init();
+ if (err)
+ goto out2;
+#endif
+
err = nfs4_register_sysctl();
if (err)
goto out2;
@@ -288,6 +295,9 @@ static void __exit exit_nfs_v4(void)
nfs4_pnfs_v3_ds_connect_unload();
unregister_nfs_version(&nfs_v4);
+#ifdef CONFIG_NFS_V4_2
+ nfs4_xattr_cache_exit();
+#endif
nfs4_unregister_sysctl();
nfs_idmap_quit();
nfs_dns_resolver_destroy();
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 543541173a3d..b4f852d4d099 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1727,6 +1727,13 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
+#ifdef CONFIG_NFS_V4_1
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) \
+ (lseg ? nfs_stateid_hash(&lseg->pls_layout->plh_stateid) : 0)
+#else
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) (0)
+#endif
+
DECLARE_EVENT_CLASS(nfs4_read_event,
TP_PROTO(
const struct nfs_pgio_header *hdr,
@@ -1745,6 +1752,8 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__field(unsigned long, error)
__field(int, stateid_seq)
__field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
@@ -1754,6 +1763,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
hdr->args.fh : &nfsi->fh;
const struct nfs4_state *state =
hdr->args.context->state;
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
@@ -1766,11 +1776,15 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
be32_to_cpu(state->stateid.seqid);
__entry->stateid_hash =
nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
),
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u stateid=%d:0x%08x",
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
-__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1778,7 +1792,8 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__entry->fhandle,
(long long)__entry->offset,
__entry->arg_count, __entry->res_count,
- __entry->stateid_seq, __entry->stateid_hash
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
#define DEFINE_NFS4_READ_EVENT(name) \
@@ -1811,6 +1826,8 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__field(unsigned long, error)
__field(int, stateid_seq)
__field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
@@ -1820,6 +1837,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
hdr->args.fh : &nfsi->fh;
const struct nfs4_state *state =
hdr->args.context->state;
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
@@ -1832,11 +1850,15 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
be32_to_cpu(state->stateid.seqid);
__entry->stateid_hash =
nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
),
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u stateid=%d:0x%08x",
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
-__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1844,7 +1866,8 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__entry->fhandle,
(long long)__entry->offset,
__entry->arg_count, __entry->res_count,
- __entry->stateid_seq, __entry->stateid_hash
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
@@ -1875,6 +1898,8 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
__field(unsigned long, error)
__field(loff_t, offset)
__field(u32, count)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
@@ -1882,6 +1907,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
const struct nfs_inode *nfsi = NFS_I(inode);
const struct nfs_fh *fh = data->args.fh ?
data->args.fh : &nfsi->fh;
+ const struct pnfs_layout_segment *lseg = data->lseg;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
@@ -1889,18 +1915,22 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
__entry->offset = data->args.offset;
__entry->count = data->args.count;
__entry->error = error < 0 ? -error : 0;
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
),
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u",
+ "offset=%lld count=%u layoutstateid=%d:0x%08x",
-__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
#define DEFINE_NFS4_COMMIT_EVENT(name) \
@@ -1993,7 +2023,9 @@ TRACE_EVENT(nfs4_layoutget,
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn_on_close);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layouterror);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutstats);
TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN);
TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 47817ef0aadb..0b3510f62623 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4166,7 +4166,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
return -EIO;
if (len < NFS4_MAXLABELLEN) {
if (label) {
- memcpy(label->label, p, len);
+ if (label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
+ }
label->len = len;
label->pi = pi;
label->lfs = lfs;
@@ -4201,6 +4205,26 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
return status;
}
+static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_XATTR_SUPPORT - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_XATTR_SUPPORT)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT;
+ }
+ dprintk("%s: XATTR support=%s\n", __func__,
+ *res == 0 ? "false" : "true");
+ return 0;
+}
+
static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)
{
unsigned int attrwords = XDR_QUADLEN(attrlen);
@@ -4855,6 +4879,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
if (status)
goto xdr_error;
+ status = decode_attr_xattrsupport(xdr, bitmap,
+ &fsinfo->xattr_support);
+ if (status)
+ goto xdr_error;
+
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -5227,7 +5256,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
* The XDR encode routine has set things up so that
* the link text will be copied directly into the
* buffer. We just have to do overflow-checking,
- * and and null-terminate the text (the VFS expects
+ * and null-terminate the text (the VFS expects
* null-termination).
*/
xdr_terminate_string(rcvbuf, len);
@@ -7456,6 +7485,8 @@ static struct {
{ NFS4ERR_SYMLINK, -ELOOP },
{ NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
{ NFS4ERR_DEADLOCK, -EDEADLK },
+ { NFS4ERR_NOXATTR, -ENODATA },
+ { NFS4ERR_XATTR2BIG, -E2BIG },
{ -1, -EIO }
};
@@ -7584,6 +7615,10 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
+ PROC42(GETXATTR, enc_getxattr, dec_getxattr),
+ PROC42(SETXATTR, enc_setxattr, dec_setxattr),
+ PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
+ PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 547cec79899f..5a59dcdce0b2 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
{ NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \
{ NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
{ NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
- { NFS_INO_INVALID_OTHER, "INVALID_OTHER" })
+ { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
+ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" })
TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
TRACE_DEFINE_ENUM(NFS_INO_STALE);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6ea4cac41e46..6985cacf4700 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -711,7 +711,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
case FLUSH_COND_STABLE:
if (nfs_reqs_to_commit(cinfo))
break;
- /* fall through */
+ fallthrough;
default:
hdr->args.stable = NFS_FILE_SYNC;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index dd2e14f5875d..71f7741126b6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1226,31 +1226,27 @@ out:
return status;
}
+static bool
+pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
+ enum pnfs_iomode iomode,
+ u32 seq)
+{
+ struct pnfs_layout_range recall_range = {
+ .length = NFS4_MAX_UINT64,
+ .iomode = iomode,
+ };
+ return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &recall_range, seq) != -EBUSY;
+}
+
/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
- struct pnfs_layout_segment *s;
- enum pnfs_iomode iomode;
- u32 seq;
-
if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
return false;
-
- seq = lo->plh_return_seq;
- iomode = lo->plh_return_iomode;
-
- /* Defer layoutreturn until all recalled lsegs are done */
- list_for_each_entry(s, &lo->plh_segs, pls_list) {
- if (seq && pnfs_seqid_is_newer(s->pls_seq, seq))
- continue;
- if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode)
- continue;
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
- return false;
- }
-
- return true;
+ return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
+ lo->plh_return_seq);
}
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
@@ -1545,16 +1541,16 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
case 0:
if (res->lrs_present)
res_stateid = &res->stateid;
- /* Fallthrough */
+ fallthrough;
default:
arg_stateid = &args->stateid;
}
+ trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret);
pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
res_stateid);
if (ld_private && ld_private->ops && ld_private->ops->free)
ld_private->ops->free(ld_private);
pnfs_put_layout_hdr(lo);
- trace_nfs4_layoutreturn_on_close(args->inode, 0);
}
bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
@@ -2392,16 +2388,6 @@ out_forget:
return ERR_PTR(-EAGAIN);
}
-static int
-mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
-{
- if (!mark_lseg_invalid(lseg, tmp_list))
- return 0;
- pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg);
- return 1;
-}
-
/**
* pnfs_mark_matching_lsegs_return - Free or return matching layout segments
* @lo: pointer to layout header
@@ -2438,7 +2424,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
- if (mark_lseg_invalid_or_return(lseg, tmp_list))
+ if (mark_lseg_invalid(lseg, tmp_list))
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
@@ -2953,7 +2939,8 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
}
/* Resend all requests through pnfs. */
-void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr,
+ unsigned int mirror_idx)
{
struct nfs_pageio_descriptor pgio;
@@ -2964,6 +2951,7 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
nfs_pageio_init_read(&pgio, hdr->inode, false,
hdr->completion_ops);
+ pgio.pg_mirror_idx = mirror_idx;
hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
}
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8e0ada581b92..2661c44c62db 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -311,7 +311,7 @@ int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
void pnfs_ld_write_done(struct nfs_pgio_header *);
void pnfs_ld_read_done(struct nfs_pgio_header *);
-void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7a70287f21a2..f943e37853fa 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1200,13 +1200,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static void nfs_set_readahead(struct backing_dev_info *bdi,
- unsigned long iomax_pages)
-{
- bdi->ra_pages = VM_READAHEAD_PAGES;
- bdi->io_pages = iomax_pages;
-}
-
int nfs_get_tree_common(struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
@@ -1251,7 +1244,7 @@ int nfs_get_tree_common(struct fs_context *fc)
MINOR(server->s_dev));
if (error)
goto error_splat_super;
- nfs_set_readahead(s->s_bdi, server->rpages);
+ s->s_bdi->io_pages = server->rpages;
server->super = s;
}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 8ceb6425e01a..d056ad2fdefd 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -237,7 +237,7 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
break;
case ACL_MASK:
mask = pa;
- /* fall through */
+ fallthrough;
case ACL_OTHER:
break;
}
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 9bbaa671c079..a07c39c94bbd 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -83,13 +83,13 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
bex->soff = iomap.addr;
break;
}
- /*FALLTHRU*/
+ fallthrough;
case IOMAP_HOLE:
if (seg->iomode == IOMODE_READ) {
bex->es = PNFS_BLOCK_NONE_DATA;
break;
}
- /*FALLTHRU*/
+ fallthrough;
case IOMAP_DELALLOC:
default:
WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
@@ -170,7 +170,7 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
- if (sb->s_bdev != sb->s_bdev->bd_contains)
+ if (bdev_is_partition(sb->s_bdev))
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
}
@@ -382,7 +382,7 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
- if (sb->s_bdev != sb->s_bdev->bd_contains)
+ if (bdev_is_partition(sb->s_bdev))
return nfserr_inval;
return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7fbe9840a03e..052be5bf9ef5 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1119,7 +1119,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
break;
case -ESERVERFAULT:
++session->se_cb_seq_nr;
- /* Fall through */
+ fallthrough;
case 1:
case -NFS4ERR_BADSESSION:
nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e12409eca7cc..a97873f2d22b 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -681,7 +681,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
rpc_delay(task, HZ/100); /* 10 mili-seconds */
return 0;
}
- /* Fallthrough */
+ fallthrough;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a527da3d8052..eaf50eafa935 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -428,7 +428,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
reclaim = true;
- /* fall through */
+ fallthrough;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, cstate, open);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 81ed8e8bab3f..c09a2a4281ec 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3117,7 +3117,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
default: /* checked by xdr code */
WARN_ON_ONCE(1);
- /* fall through */
+ fallthrough;
case SP4_SSV:
status = nfserr_encr_alg_unsupp;
goto out_nolock;
@@ -4532,7 +4532,7 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
rpc_delay(task, 2 * HZ);
return 0;
}
- /*FALLTHRU*/
+ fallthrough;
default:
return 1;
}
@@ -4597,6 +4597,8 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
if (!i_am_nfsd())
return NULL;
rqst = kthread_data(current);
+ if (!rqst->rq_lease_breaker)
+ return NULL;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
@@ -5652,7 +5654,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
break;
default:
printk("unknown stateid type %x\n", s->sc_type);
- /* Fallthrough */
+ fallthrough;
case NFS4_CLOSED_STID:
case NFS4_CLOSED_DELEG_STID:
status = nfserr_bad_stateid;
@@ -6742,7 +6744,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
case NFS4_READW_LT:
if (nfsd4_has_session(cstate))
fl_flags |= FL_SLEEP;
- /* Fallthrough */
+ fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
nf = find_readable_file_locked(fp);
@@ -6754,7 +6756,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
case NFS4_WRITEW_LT:
if (nfsd4_has_session(cstate))
fl_flags |= FL_SLEEP;
- /* Fallthrough */
+ fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
nf = find_writeable_file_locked(fp);
@@ -6816,7 +6818,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case FILE_LOCK_DEFERRED:
nbl = NULL;
- /* Fallthrough */
+ fallthrough;
case -EAGAIN: /* conflock holds conflicting lock */
status = nfserr_denied;
dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 37bc8f5f4514..c81dbbad8792 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -459,7 +459,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
case FSID_DEV:
if (!old_valid_dev(exp_sb(exp)->s_dev))
return false;
- /* FALL THROUGH */
+ fallthrough;
case FSID_MAJOR_MINOR:
case FSID_ENCODE_DEV:
return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
@@ -469,7 +469,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
case FSID_UUID16:
if (!is_root_export(exp))
return false;
- /* fall through */
+ fallthrough;
case FSID_UUID4_INUM:
case FSID_UUID16_INUM:
return exp->ex_uuid != NULL;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 543bbe0a556e..6e0b066480c5 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -314,7 +314,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
rdev = inode->i_rdev;
attr->ia_valid |= ATTR_SIZE;
- /* FALLTHROUGH */
+ fallthrough;
case S_IFIFO:
/* this is probably a permission check..
* at least IRIX implements perm checking on
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b603dfcdd361..f7f6473578af 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -221,7 +221,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change)
case NFSD_TEST:
if (nn->nfsd_versions)
return nn->nfsd_versions[vers];
- /* Fallthrough */
+ fallthrough;
case NFSD_AVAIL:
return nfsd_support_version(vers);
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7d2933b85b65..aba5af9df328 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1456,7 +1456,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*created = true;
break;
}
- /* fall through */
+ fallthrough;
case NFS4_CREATE_EXCLUSIVE4_1:
if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
&& d_inode(dchild)->i_atime.tv_sec == v_atime
@@ -1465,7 +1465,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*created = true;
goto set_attr;
}
- /* fall through */
+ fallthrough;
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
}
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 235b959fc2b3..adf3bb0a8048 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -613,10 +613,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)req->pr_entry_nr);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -654,10 +654,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)req->pr_entry_nr);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -763,10 +763,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)entry_nrs[j]);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)entry_nrs[j]);
} else {
n++;
}
@@ -808,10 +808,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
ret = nilfs_palloc_delete_entry_block(inode,
last_nrs[k]);
if (ret && ret != -ENOENT)
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
- ret, (unsigned long long)last_nrs[k],
- inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
+ ret, (unsigned long long)last_nrs[k],
+ inode->i_ino);
}
desc_kaddr = kmap_atomic(desc_bh->b_page);
@@ -826,9 +826,9 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (nfree == nilfs_palloc_entries_per_group(inode)) {
ret = nilfs_palloc_delete_bitmap_block(inode, group);
if (ret && ret != -ENOENT)
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "error %d deleting bitmap block of group=%lu, ino=%lu",
- ret, group, inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "error %d deleting bitmap block of group=%lu, ino=%lu",
+ ret, group, inode->i_ino);
}
}
return 0;
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index fb5a9a8a13cf..e516ae389ca5 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -519,7 +519,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
break;
case NILFS_IFILE_INO:
lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
- /* Fall through */
+ fallthrough;
default:
bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
bmap->b_last_allocated_key = 0;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 23e043eca237..f42ab57201e7 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -351,10 +351,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
(flags & NILFS_BTREE_NODE_ROOT) ||
nchildren < 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
- inode->i_ino, (unsigned long long)blocknr, level,
- flags, nchildren);
+ nilfs_crit(inode->i_sb,
+ "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, (unsigned long long)blocknr, level,
+ flags, nchildren);
ret = 1;
}
return ret;
@@ -381,9 +381,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
- inode->i_ino, level, flags, nchildren);
+ nilfs_crit(inode->i_sb,
+ "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, level, flags, nchildren);
ret = 1;
}
return ret;
@@ -450,10 +450,10 @@ static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
{
if (unlikely(nilfs_btree_node_get_level(node) != level)) {
dump_stack();
- nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
- "btree level mismatch (ino=%lu): %d != %d",
- btree->b_inode->i_ino,
- nilfs_btree_node_get_level(node), level);
+ nilfs_crit(btree->b_inode->i_sb,
+ "btree level mismatch (ino=%lu): %d != %d",
+ btree->b_inode->i_ino,
+ nilfs_btree_node_get_level(node), level);
return 1;
}
return 0;
@@ -508,7 +508,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
out_no_wait:
if (!buffer_uptodate(bh)) {
- nilfs_msg(btree->b_inode->i_sb, KERN_ERR,
+ nilfs_err(btree->b_inode->i_sb,
"I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
btree->b_inode->i_ino, (unsigned long long)ptr);
brelse(bh);
@@ -2074,10 +2074,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
if (unlikely(ret == -ENOENT))
- nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
- "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
- btree->b_inode->i_ino,
- (unsigned long long)key, level);
+ nilfs_crit(btree->b_inode->i_sb,
+ "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
+ btree->b_inode->i_ino,
+ (unsigned long long)key, level);
goto out;
}
@@ -2114,11 +2114,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX) {
dump_stack();
- nilfs_msg(btree->b_inode->i_sb, KERN_WARNING,
- "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
- level, (unsigned long long)key,
- btree->b_inode->i_ino,
- (unsigned long long)bh->b_blocknr);
+ nilfs_warn(btree->b_inode->i_sb,
+ "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
+ level, (unsigned long long)key,
+ btree->b_inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
return;
}
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 8d41311b5db4..86d4d850d130 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -322,7 +322,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
int ret, ncps, nicps, nss, count, i;
if (unlikely(start == 0 || start > end)) {
- nilfs_msg(cpfile->i_sb, KERN_ERR,
+ nilfs_err(cpfile->i_sb,
"cannot delete checkpoints: invalid range [%llu, %llu)",
(unsigned long long)start, (unsigned long long)end);
return -EINVAL;
@@ -376,7 +376,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cno);
if (ret == 0)
continue;
- nilfs_msg(cpfile->i_sb, KERN_ERR,
+ nilfs_err(cpfile->i_sb,
"error %d deleting checkpoint block",
ret);
break;
@@ -981,12 +981,10 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
int err;
if (cpsize > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR,
- "too large checkpoint size: %zu bytes", cpsize);
+ nilfs_err(sb, "too large checkpoint size: %zu bytes", cpsize);
return -EINVAL;
} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
- nilfs_msg(sb, KERN_ERR,
- "too small checkpoint size: %zu bytes", cpsize);
+ nilfs_err(sb, "too small checkpoint size: %zu bytes", cpsize);
return -EINVAL;
}
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 6f4066636be9..8bccdf1158fc 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -340,11 +340,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
- nilfs_msg(dat->i_sb, KERN_CRIT,
- "%s: invalid vblocknr = %llu, [%llu, %llu)",
- __func__, (unsigned long long)vblocknr,
- (unsigned long long)le64_to_cpu(entry->de_start),
- (unsigned long long)le64_to_cpu(entry->de_end));
+ nilfs_crit(dat->i_sb,
+ "%s: invalid vblocknr = %llu, [%llu, %llu)",
+ __func__, (unsigned long long)vblocknr,
+ (unsigned long long)le64_to_cpu(entry->de_start),
+ (unsigned long long)le64_to_cpu(entry->de_end));
kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
@@ -471,11 +471,11 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
int err;
if (entry_size > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes",
+ nilfs_err(sb, "too large DAT entry size: %zu bytes",
entry_size);
return -EINVAL;
} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
- nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes",
+ nilfs_err(sb, "too small DAT entry size: %zu bytes",
entry_size);
return -EINVAL;
}
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 533e24ea3a88..f353101955e3 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -328,16 +328,18 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
key = nilfs_bmap_data_get_key(bmap, *bh);
if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
- nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
- "%s (ino=%lu): invalid key: %llu", __func__,
- bmap->b_inode->i_ino, (unsigned long long)key);
+ nilfs_crit(bmap->b_inode->i_sb,
+ "%s (ino=%lu): invalid key: %llu",
+ __func__,
+ bmap->b_inode->i_ino, (unsigned long long)key);
return -EINVAL;
}
ptr = nilfs_direct_get_ptr(bmap, key);
if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
- nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
- "%s (ino=%lu): invalid pointer: %llu", __func__,
- bmap->b_inode->i_ino, (unsigned long long)ptr);
+ nilfs_crit(bmap->b_inode->i_sb,
+ "%s (ino=%lu): invalid pointer: %llu",
+ __func__,
+ bmap->b_inode->i_ino, (unsigned long long)ptr);
return -EINVAL;
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index aa3c328ee189..448320496856 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -142,7 +142,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
if (!buffer_uptodate(bh)) {
struct inode *inode = bh->b_page->mapping->host;
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
buffer_nilfs_node(bh) ? "node" : "data",
inode->i_ino, (unsigned long long)bh->b_blocknr);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 4140d232cadc..02727ed3a7c6 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -142,8 +142,8 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
if (unlikely(err))
- nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu",
- err, (unsigned long)ino);
+ nilfs_warn(sb, "error %d reading inode: ino=%lu",
+ err, (unsigned long)ino);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 28009ec54420..745d371d6fea 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -104,10 +104,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
* However, the page having this block must
* be locked in this case.
*/
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
- __func__, inode->i_ino,
- (unsigned long long)blkoff);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
+ __func__, inode->i_ino,
+ (unsigned long long)blkoff);
err = 0;
}
nilfs_transaction_abort(inode->i_sb);
@@ -388,7 +388,8 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
failed_after_creation:
clear_nlink(inode);
- unlock_new_inode(inode);
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
iput(inode); /*
* raw_inode will be deleted through
* nilfs_evict_inode().
@@ -706,9 +707,8 @@ repeat:
goto repeat;
failed:
- nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING,
- "error %d truncating bmap (ino=%lu)", ret,
- ii->vfs_inode.i_ino);
+ nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%lu)",
+ ret, ii->vfs_inode.i_ino);
}
void nilfs_truncate(struct inode *inode)
@@ -919,9 +919,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
* This will happen when somebody is freeing
* this inode.
*/
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "cannot set file dirty (ino=%lu): the file is being freed",
- inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "cannot set file dirty (ino=%lu): the file is being freed",
+ inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
return -EINVAL; /*
* NILFS_I_DIRTY may remain for
@@ -942,9 +942,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "cannot mark inode dirty (ino=%lu): error %d loading inode block",
- inode->i_ino, err);
+ nilfs_warn(inode->i_sb,
+ "cannot mark inode dirty (ino=%lu): error %d loading inode block",
+ inode->i_ino, err);
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -970,8 +970,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "tried to mark bad_inode dirty. ignored.");
+ nilfs_warn(inode->i_sb,
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 4ba73dbf3e8d..07d26f61f22a 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -569,25 +569,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
if (unlikely(ret < 0)) {
if (ret == -ENOENT)
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_crit(inode->i_sb,
+ "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
return ret;
}
if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_crit(inode->i_sb,
+ "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
brelse(bh);
return -EEXIST;
}
@@ -837,8 +837,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret,
- msg);
+ nilfs_err(nilfs->ns_sb, "error %d preparing GC: %s", ret, msg);
return ret;
}
@@ -947,7 +946,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
if (ret < 0) {
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"error %d preparing GC: cannot read source blocks",
ret);
} else {
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 700870a92bc4..c0361ce45f62 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -199,7 +199,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
out_no_wait:
err = -EIO;
if (!buffer_uptodate(first_bh)) {
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"I/O error reading meta-data file (ino=%lu, block-offset=%lu)",
inode->i_ino, block);
goto failed_bh;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 9fe6d4ab74f0..a6ec7961d4f5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -272,9 +272,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
goto out;
if (!inode->i_nlink) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "deleting nonexistent file (ino=%lu), %d",
- inode->i_ino, inode->i_nlink);
+ nilfs_warn(inode->i_sb,
+ "deleting nonexistent file (ino=%lu), %d",
+ inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 42395ba52da6..f8450ee3fd06 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -289,9 +289,8 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
-extern __printf(3, 4)
-void __nilfs_msg(struct super_block *sb, const char *level,
- const char *fmt, ...);
+__printf(2, 3)
+void __nilfs_msg(struct super_block *sb, const char *fmt, ...);
extern __printf(3, 4)
void __nilfs_error(struct super_block *sb, const char *function,
const char *fmt, ...);
@@ -299,7 +298,7 @@ void __nilfs_error(struct super_block *sb, const char *function,
#ifdef CONFIG_PRINTK
#define nilfs_msg(sb, level, fmt, ...) \
- __nilfs_msg(sb, level, fmt, ##__VA_ARGS__)
+ __nilfs_msg(sb, level fmt, ##__VA_ARGS__)
#define nilfs_error(sb, fmt, ...) \
__nilfs_error(sb, __func__, fmt, ##__VA_ARGS__)
@@ -307,7 +306,7 @@ void __nilfs_error(struct super_block *sb, const char *function,
#define nilfs_msg(sb, level, fmt, ...) \
do { \
- no_printk(fmt, ##__VA_ARGS__); \
+ no_printk(level fmt, ##__VA_ARGS__); \
(void)(sb); \
} while (0)
#define nilfs_error(sb, fmt, ...) \
@@ -318,6 +317,15 @@ void __nilfs_error(struct super_block *sb, const char *function,
#endif /* CONFIG_PRINTK */
+#define nilfs_crit(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_CRIT, fmt, ##__VA_ARGS__)
+#define nilfs_err(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define nilfs_warn(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define nilfs_info(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
+
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index d7fc8d369d89..b175f1330408 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -391,9 +391,8 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
BUG_ON(!PageLocked(page));
if (!silent)
- nilfs_msg(sb, KERN_WARNING,
- "discard dirty page: offset=%lld, ino=%lu",
- page_offset(page), inode->i_ino);
+ nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu",
+ page_offset(page), inode->i_ino);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -409,9 +408,9 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
do {
lock_buffer(bh);
if (!silent)
- nilfs_msg(sb, KERN_WARNING,
- "discard dirty block: blocknr=%llu, size=%zu",
- (u64)bh->b_blocknr, bh->b_size);
+ nilfs_warn(sb,
+ "discard dirty block: blocknr=%llu, size=%zu",
+ (u64)bh->b_blocknr, bh->b_size);
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 140b663e91c7..2217f904a7cf 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -51,7 +51,7 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err)
switch (err) {
case NILFS_SEG_FAIL_IO:
- nilfs_msg(sb, KERN_ERR, "I/O error reading segment");
+ nilfs_err(sb, "I/O error reading segment");
return -EIO;
case NILFS_SEG_FAIL_MAGIC:
msg = "Magic number mismatch";
@@ -72,10 +72,10 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err)
msg = "No super root in the last segment";
break;
default:
- nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err);
+ nilfs_err(sb, "unrecognized segment error %d", err);
return -EINVAL;
}
- nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg);
+ nilfs_warn(sb, "invalid segment: %s", msg);
return -EINVAL;
}
@@ -543,10 +543,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
put_page(page);
failed_inode:
- nilfs_msg(sb, KERN_WARNING,
- "error %d recovering data block (ino=%lu, block-offset=%llu)",
- err, (unsigned long)rb->ino,
- (unsigned long long)rb->blkoff);
+ nilfs_warn(sb,
+ "error %d recovering data block (ino=%lu, block-offset=%llu)",
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -626,7 +626,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
!(flags & NILFS_SS_SYNDT))
goto try_next_pseg;
state = RF_DSYNC_ST;
- /* Fall through */
+ fallthrough;
case RF_DSYNC_ST:
if (!(flags & NILFS_SS_SYNDT))
goto confused;
@@ -669,8 +669,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
}
if (nsalvaged_blocks) {
- nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks",
- nsalvaged_blocks);
+ nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks);
ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
}
out:
@@ -681,7 +680,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
confused:
err = -EINVAL;
failed:
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d roll-forwarding partial segment at blocknr = %llu",
err, (unsigned long long)pseg_start);
goto out;
@@ -703,8 +702,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
set_buffer_dirty(bh);
err = sync_dirty_buffer(bh);
if (unlikely(err))
- nilfs_msg(nilfs->ns_sb, KERN_WARNING,
- "buffer sync write failed during post-cleaning of recovery.");
+ nilfs_warn(nilfs->ns_sb,
+ "buffer sync write failed during post-cleaning of recovery.");
brelse(bh);
}
@@ -739,8 +738,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d loading the latest checkpoint", err);
+ nilfs_err(sb, "error %d loading the latest checkpoint", err);
return err;
}
@@ -751,8 +749,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d preparing segment for recovery",
+ nilfs_err(sb, "error %d preparing segment for recovery",
err);
goto failed;
}
@@ -766,8 +763,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
nilfs_detach_log_writer(sb);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d writing segment for recovery",
+ nilfs_err(sb, "error %d writing segment for recovery",
err);
goto failed;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 20c479b5e41b..1a8729eded8b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -505,7 +505,7 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
} while (--segbuf->sb_nbio > 0);
if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
- nilfs_msg(segbuf->sb_super, KERN_ERR,
+ nilfs_err(segbuf->sb_super,
"I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu",
(unsigned long long)segbuf->sb_pseg_start,
segbuf->sb_sum.nblocks,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 91b58c897f92..e3726aca28ed 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -158,7 +158,7 @@ static int nilfs_prepare_segment_lock(struct super_block *sb,
* it is saved and will be restored on
* nilfs_transaction_commit().
*/
- nilfs_msg(sb, KERN_WARNING, "journal info from a different FS");
+ nilfs_warn(sb, "journal info from a different FS");
save = current->journal_info;
}
if (!ti) {
@@ -1138,7 +1138,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
goto dat_stage;
}
- nilfs_sc_cstage_inc(sci); /* Fall through */
+ nilfs_sc_cstage_inc(sci);
+ fallthrough;
case NILFS_ST_GC:
if (nilfs_doing_gc()) {
head = &sci->sc_gc_inodes;
@@ -1159,7 +1160,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.gc_inode_ptr = NULL;
}
- nilfs_sc_cstage_inc(sci); /* Fall through */
+ nilfs_sc_cstage_inc(sci);
+ fallthrough;
case NILFS_ST_FILE:
head = &sci->sc_dirty_files;
ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1186,7 +1188,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
nilfs_sc_cstage_inc(sci);
sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
- /* Fall through */
+ fallthrough;
case NILFS_ST_IFILE:
err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
&nilfs_sc_file_ops);
@@ -1197,13 +1199,14 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
err = nilfs_segctor_create_checkpoint(sci);
if (unlikely(err))
break;
- /* Fall through */
+ fallthrough;
case NILFS_ST_CPFILE:
err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- nilfs_sc_cstage_inc(sci); /* Fall through */
+ nilfs_sc_cstage_inc(sci);
+ fallthrough;
case NILFS_ST_SUFILE:
err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
sci->sc_nfreesegs, &ndone);
@@ -1219,7 +1222,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- nilfs_sc_cstage_inc(sci); /* Fall through */
+ nilfs_sc_cstage_inc(sci);
+ fallthrough;
case NILFS_ST_DAT:
dat_stage:
err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1230,7 +1234,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- nilfs_sc_cstage_inc(sci); /* Fall through */
+ nilfs_sc_cstage_inc(sci);
+ fallthrough;
case NILFS_ST_SR:
if (mode == SC_LSEG_SR) {
/* Appending a super root */
@@ -1940,9 +1945,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err = nilfs_ifile_get_inode_block(
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "log writer: error %d getting inode block (ino=%lu)",
- err, ii->vfs_inode.i_ino);
+ nilfs_warn(sci->sc_super,
+ "log writer: error %d getting inode block (ino=%lu)",
+ err, ii->vfs_inode.i_ino);
return err;
}
spin_lock(&nilfs->ns_inode_lock);
@@ -2449,7 +2454,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
if (likely(!err))
break;
- nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err);
+ nilfs_warn(sb, "error %d cleaning segments", err);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(sci->sc_interval);
}
@@ -2457,9 +2462,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
sci->sc_nfreesegs);
if (ret) {
- nilfs_msg(sb, KERN_WARNING,
- "error %d on discard request, turning discards off for the device",
- ret);
+ nilfs_warn(sb,
+ "error %d on discard request, turning discards off for the device",
+ ret);
nilfs_clear_opt(nilfs, DISCARD);
}
}
@@ -2540,9 +2545,9 @@ static int nilfs_segctor_thread(void *arg)
/* start sync. */
sci->sc_task = current;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
- nilfs_msg(sci->sc_super, KERN_INFO,
- "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
- sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ nilfs_info(sci->sc_super,
+ "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
+ sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
spin_lock(&sci->sc_state_lock);
loop:
@@ -2616,8 +2621,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
if (IS_ERR(t)) {
int err = PTR_ERR(t);
- nilfs_msg(sci->sc_super, KERN_ERR,
- "error %d creating segctord thread", err);
+ nilfs_err(sci->sc_super, "error %d creating segctord thread",
+ err);
return err;
}
wait_event(sci->sc_wait_task, sci->sc_task != NULL);
@@ -2727,14 +2732,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_segctor_write_out(sci);
if (!list_empty(&sci->sc_dirty_files)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "disposed unprocessed dirty file(s) when stopping log writer");
+ nilfs_warn(sci->sc_super,
+ "disposed unprocessed dirty file(s) when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "disposed unprocessed inode(s) in iput queue when stopping log writer");
+ nilfs_warn(sci->sc_super,
+ "disposed unprocessed inode(s) in iput queue when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2812,8 +2817,8 @@ void nilfs_detach_log_writer(struct super_block *sb)
spin_lock(&nilfs->ns_inode_lock);
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
- nilfs_msg(sb, KERN_WARNING,
- "disposed unprocessed dirty file(s) when detaching log writer");
+ nilfs_warn(sb,
+ "disposed unprocessed dirty file(s) when detaching log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index bf3f8f05c89b..42ff67c0c14f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -171,9 +171,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
down_write(&NILFS_MDT(sufile)->mi_sem);
for (seg = segnumv; seg < segnumv + nsegs; seg++) {
if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: invalid segment number: %llu",
- __func__, (unsigned long long)*seg);
+ nilfs_warn(sufile->i_sb,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)*seg);
nerr++;
}
}
@@ -230,9 +230,8 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
int ret;
if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: invalid segment number: %llu",
- __func__, (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)segnum);
return -EINVAL;
}
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -410,9 +409,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: segment %llu must be clean", __func__,
- (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -468,9 +466,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: segment %llu is already clean",
- __func__, (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -1168,12 +1165,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
int err;
if (susize > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR,
- "too large segment usage size: %zu bytes", susize);
+ nilfs_err(sb, "too large segment usage size: %zu bytes",
+ susize);
return -EINVAL;
} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
- nilfs_msg(sb, KERN_ERR,
- "too small segment usage size: %zu bytes", susize);
+ nilfs_err(sb, "too small segment usage size: %zu bytes",
+ susize);
return -EINVAL;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5729ee86da9a..2eee5fb1a882 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -62,19 +62,25 @@ struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
-void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt,
- ...)
+void __nilfs_msg(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
+
if (sb)
- printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf);
+ printk("%c%cNILFS (%s): %pV\n",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
- printk("%sNILFS: %pV\n", level, &vaf);
+ printk("%c%cNILFS: %pV\n",
+ KERN_SOH_ASCII, level, &vaf);
+
va_end(args);
}
@@ -106,7 +112,7 @@ static void nilfs_set_error(struct super_block *sb)
*
* This implements the body of nilfs_error() macro. Normally,
* nilfs_error() should be used. As for sustainable errors such as a
- * single-shot I/O error, nilfs_msg() should be used instead.
+ * single-shot I/O error, nilfs_err() should be used instead.
*
* Callers should not add a trailing newline since this will do it.
*/
@@ -178,8 +184,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
}
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d",
- err);
+ nilfs_err(sb, "unable to write superblock: err=%d", err);
if (err == -EIO && nilfs->ns_sbh[1]) {
/*
* sbp[0] points to newer log than sbp[1],
@@ -249,7 +254,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
} else {
- nilfs_msg(sb, KERN_CRIT, "superblock broke");
+ nilfs_crit(sb, "superblock broke");
return NULL;
}
} else if (sbp[1] &&
@@ -359,9 +364,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
offset = sb2off & (nilfs->ns_blocksize - 1);
nsbh = sb_getblk(sb, newblocknr);
if (!nsbh) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to move secondary superblock to block %llu",
- (unsigned long long)newblocknr);
+ nilfs_warn(sb,
+ "unable to move secondary superblock to block %llu",
+ (unsigned long long)newblocknr);
ret = -EIO;
goto out;
}
@@ -524,7 +529,7 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"Invalid checkpoint (checkpoint number=%llu)",
(unsigned long long)cno);
err = -EINVAL;
@@ -622,8 +627,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
err = nilfs_ifile_count_free_inodes(root->ifile,
&nmaxinodes, &nfreeinodes);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_WARNING,
- "failed to count free inodes: err=%d", err);
+ nilfs_warn(sb, "failed to count free inodes: err=%d", err);
if (err == -ERANGE) {
/*
* If nilfs_palloc_count_max_entries() returns
@@ -755,7 +759,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
break;
case Opt_snapshot:
if (is_remount) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"\"%s\" option is invalid for remount",
p);
return 0;
@@ -771,8 +775,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
nilfs_clear_opt(nilfs, DISCARD);
break;
default:
- nilfs_msg(sb, KERN_ERR,
- "unrecognized mount option \"%s\"", p);
+ nilfs_err(sb, "unrecognized mount option \"%s\"", p);
return 0;
}
}
@@ -808,10 +811,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
- nilfs_msg(sb, KERN_WARNING, "mounting fs with errors");
+ nilfs_warn(sb, "mounting fs with errors");
#if 0
} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
- nilfs_msg(sb, KERN_WARNING, "maximal mount count reached");
+ nilfs_warn(sb, "maximal mount count reached");
#endif
}
if (!max_mnt_count)
@@ -874,7 +877,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_incompat) &
~NILFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount because of unsupported optional features (%llx)",
(unsigned long long)features);
return -EINVAL;
@@ -882,7 +885,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount RDWR because of unsupported optional features (%llx)",
(unsigned long long)features);
return -EINVAL;
@@ -901,12 +904,12 @@ static int nilfs_get_root_dentry(struct super_block *sb,
inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret);
+ nilfs_err(sb, "error %d getting root inode", ret);
goto out;
}
if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
iput(inode);
- nilfs_msg(sb, KERN_ERR, "corrupt root inode");
+ nilfs_err(sb, "corrupt root inode");
ret = -EINVAL;
goto out;
}
@@ -934,7 +937,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
return ret;
failed_dentry:
- nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret);
+ nilfs_err(sb, "error %d getting root dentry", ret);
goto out;
}
@@ -954,7 +957,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = (ret == -ENOENT) ? -EINVAL : ret;
goto out;
} else if (!ret) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"The specified checkpoint is not a snapshot (checkpoint number=%llu)",
(unsigned long long)cno);
ret = -EINVAL;
@@ -963,7 +966,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = nilfs_attach_checkpoint(s, cno, false, &root);
if (ret) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"error %d while loading snapshot (checkpoint number=%llu)",
ret, (unsigned long long)cno);
goto out;
@@ -1060,7 +1063,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d while loading last checkpoint (checkpoint number=%llu)",
err, (unsigned long long)cno);
goto failed_unload;
@@ -1122,8 +1125,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
- nilfs_msg(sb, KERN_WARNING,
- "couldn't remount because the filesystem is in an incomplete recovery state");
+ nilfs_warn(sb,
+ "couldn't remount because the filesystem is in an incomplete recovery state");
goto restore_opts;
}
@@ -1155,9 +1158,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
~NILFS_FEATURE_COMPAT_RO_SUPP;
up_read(&nilfs->ns_sem);
if (features) {
- nilfs_msg(sb, KERN_WARNING,
- "couldn't remount RDWR because of unsupported optional features (%llx)",
- (unsigned long long)features);
+ nilfs_warn(sb,
+ "couldn't remount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto restore_opts;
}
@@ -1216,7 +1219,7 @@ static int nilfs_parse_snapshot_option(const char *option,
return 0;
parse_error:
- nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg);
+ nilfs_err(NULL, "invalid option \"%s\": %s", option, msg);
return 1;
}
@@ -1319,7 +1322,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"the device already has a %s mount.",
sb_rdonly(s) ? "read-only" : "read/write");
err = -EBUSY;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index e60be7bb55b0..303d71430bdd 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -263,8 +263,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get checkpoint stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d",
+ err);
return err;
}
@@ -286,8 +286,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get checkpoint stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d",
+ err);
return err;
}
@@ -405,8 +405,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get segment stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get segment stat: err=%d",
+ err);
return err;
}
@@ -779,15 +779,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
err = kstrtouint(skip_spaces(buf), 0, &val);
if (err) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to convert string: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to convert string: err=%d",
+ err);
return err;
}
if (val < NILFS_SB_FREQ) {
val = NILFS_SB_FREQ;
- nilfs_msg(nilfs->ns_sb, KERN_WARNING,
- "superblock update frequency cannot be lesser than 10 seconds");
+ nilfs_warn(nilfs->ns_sb,
+ "superblock update frequency cannot be lesser than 10 seconds");
}
down_write(&nilfs->ns_sem);
@@ -990,8 +990,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
if (unlikely(!nilfs->ns_dev_subgroups)) {
err = -ENOMEM;
- nilfs_msg(sb, KERN_ERR,
- "unable to allocate memory for device group");
+ nilfs_err(sb, "unable to allocate memory for device group");
goto failed_create_device_group;
}
@@ -1101,15 +1100,13 @@ int __init nilfs_sysfs_init(void)
nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
if (!nilfs_kset) {
err = -ENOMEM;
- nilfs_msg(NULL, KERN_ERR,
- "unable to create sysfs entry: err=%d", err);
+ nilfs_err(NULL, "unable to create sysfs entry: err=%d", err);
goto failed_sysfs_init;
}
err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
if (unlikely(err)) {
- nilfs_msg(NULL, KERN_ERR,
- "unable to create feature group: err=%d", err);
+ nilfs_err(NULL, "unable to create feature group: err=%d", err);
goto cleanup_sysfs_init;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 484785cdf96e..221a1cc597f0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -183,7 +183,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
nilfs->ns_cno = nilfs->ns_last_cno + 1;
if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"pointed segment number is out of range: segnum=%llu, nsegments=%lu",
(unsigned long long)nilfs->ns_segnum,
nilfs->ns_nsegments);
@@ -210,12 +210,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int err;
if (!valid_fs) {
- nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
+ nilfs_warn(sb, "mounting unchecked fs");
if (s_flags & SB_RDONLY) {
- nilfs_msg(sb, KERN_INFO,
- "recovery required for readonly filesystem");
- nilfs_msg(sb, KERN_INFO,
- "write access will be enabled during recovery");
+ nilfs_info(sb,
+ "recovery required for readonly filesystem");
+ nilfs_info(sb,
+ "write access will be enabled during recovery");
}
}
@@ -230,12 +230,11 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto scan_error;
if (!nilfs_valid_sb(sbp[1])) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to fall back to spare super block");
+ nilfs_warn(sb,
+ "unable to fall back to spare super block");
goto scan_error;
}
- nilfs_msg(sb, KERN_INFO,
- "trying rollback from an earlier position");
+ nilfs_info(sb, "trying rollback from an earlier position");
/*
* restore super block with its spare and reconfigure
@@ -248,9 +247,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
/* verify consistency between two super blocks */
blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
if (blocksize != nilfs->ns_blocksize) {
- nilfs_msg(sb, KERN_WARNING,
- "blocksize differs between two super blocks (%d != %d)",
- blocksize, nilfs->ns_blocksize);
+ nilfs_warn(sb,
+ "blocksize differs between two super blocks (%d != %d)",
+ blocksize, nilfs->ns_blocksize);
goto scan_error;
}
@@ -269,8 +268,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR, "error %d while loading super root",
- err);
+ nilfs_err(sb, "error %d while loading super root", err);
goto failed;
}
@@ -281,28 +279,28 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
- nilfs_msg(sb, KERN_INFO,
- "norecovery option specified, skipping roll-forward recovery");
+ nilfs_info(sb,
+ "norecovery option specified, skipping roll-forward recovery");
goto skip_recovery;
}
features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't proceed with recovery because of unsupported optional features (%llx)",
(unsigned long long)features);
err = -EROFS;
goto failed_unload;
}
if (really_read_only) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"write access unavailable, cannot proceed");
err = -EROFS;
goto failed_unload;
}
sb->s_flags &= ~SB_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"recovery cancelled because norecovery option was specified for a read/write mount");
err = -EINVAL;
goto failed_unload;
@@ -318,12 +316,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
up_write(&nilfs->ns_sem);
if (err) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d updating super block. recovery unfinished.",
err);
goto failed_unload;
}
- nilfs_msg(sb, KERN_INFO, "recovery complete");
+ nilfs_info(sb, "recovery complete");
skip_recovery:
nilfs_clear_recovery_info(&ri);
@@ -331,7 +329,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
return 0;
scan_error:
- nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err);
+ nilfs_err(sb, "error %d while searching super root", err);
goto failed;
failed_unload:
@@ -378,7 +376,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
le32_to_cpu(sbp->s_rev_level),
le16_to_cpu(sbp->s_minor_rev_level),
@@ -391,13 +389,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too large inode size: %d bytes",
+ nilfs_err(nilfs->ns_sb, "too large inode size: %d bytes",
nilfs->ns_inode_size);
return -EINVAL;
} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too small inode size: %d bytes",
+ nilfs_err(nilfs->ns_sb, "too small inode size: %d bytes",
nilfs->ns_inode_size);
return -EINVAL;
}
@@ -406,8 +402,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too short segment: %lu blocks",
+ nilfs_err(nilfs->ns_sb, "too short segment: %lu blocks",
nilfs->ns_blocks_per_segment);
return -EINVAL;
}
@@ -417,7 +412,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
le32_to_cpu(sbp->s_r_segments_percentage);
if (nilfs->ns_r_segments_percentage < 1 ||
nilfs->ns_r_segments_percentage > 99) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"invalid reserved segments percentage: %lu",
nilfs->ns_r_segments_percentage);
return -EINVAL;
@@ -503,16 +498,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
if (!sbp[0]) {
if (!sbp[1]) {
- nilfs_msg(sb, KERN_ERR, "unable to read superblock");
+ nilfs_err(sb, "unable to read superblock");
return -EIO;
}
- nilfs_msg(sb, KERN_WARNING,
- "unable to read primary superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "unable to read primary superblock (blocksize = %d)",
+ blocksize);
} else if (!sbp[1]) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to read secondary superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "unable to read secondary superblock (blocksize = %d)",
+ blocksize);
}
/*
@@ -534,14 +529,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
}
if (!valid[swp]) {
nilfs_release_super_block(nilfs);
- nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device");
+ nilfs_err(sb, "couldn't find nilfs on the device");
return -EINVAL;
}
if (!valid[!swp])
- nilfs_msg(sb, KERN_WARNING,
- "broken superblock, retrying with spare superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "broken superblock, retrying with spare superblock (blocksize = %d)",
+ blocksize);
if (swp)
nilfs_swap_super_block(nilfs);
@@ -575,7 +570,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
- nilfs_msg(sb, KERN_ERR, "unable to set blocksize");
+ nilfs_err(sb, "unable to set blocksize");
err = -EINVAL;
goto out;
}
@@ -594,7 +589,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
if (blocksize < NILFS_MIN_BLOCK_SIZE ||
blocksize > NILFS_MAX_BLOCK_SIZE) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount because of unsupported filesystem blocksize %d",
blocksize);
err = -EINVAL;
@@ -604,7 +599,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
if (blocksize < hw_blocksize) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"blocksize %d too small for device (sector-size = %d)",
blocksize, hw_blocksize);
err = -EINVAL;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 559de311deca..3e01d8f2ab90 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1147,7 +1147,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
- case FAN_MARK_ADD: /* fallthrough */
+ case FAN_MARK_ADD:
case FAN_MARK_REMOVE:
if (!mask)
return -EINVAL;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9bb9f0952b18..caf563981532 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi)
brelse(bh);
}
+ if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
+ ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
+ le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
+ goto err_out;
+ }
+
/* Apply the mst fixups. */
if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
/* FIXME: Try to use the $MFTMirr now. */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4c1b90442d6f..78710788c237 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
- /* Appending truncate log(TA) and and flushing truncate log(TF) are
+ /* Appending truncate log(TA) and flushing truncate log(TF) are
* two separated transactions. They can be both committed but not
* checkpointed. If crash occurs then, both two transaction will be
* replayed with several already released to global bitmap clusters.
@@ -7654,8 +7654,10 @@ out_mutex:
* main_bm related locks for avoiding the current IO starve, then go to
* trim the next group
*/
- if (ret >= 0 && group <= last_group)
+ if (ret >= 0 && group <= last_group) {
+ cond_resched();
goto next_group;
+ }
out:
range->len = trimmed * sb->s_blocksize;
return ret;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 89d13e0705fe..0179a73a3fa2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1766,7 +1766,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
int sectsize;
char *p = (char *)page;
struct fd f;
- struct inode *inode;
ssize_t ret = -EINVAL;
int live_threshold;
@@ -1793,20 +1792,16 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
reg->hr_block_bytes == 0)
goto out2;
- inode = igrab(f.file->f_mapping->host);
- if (inode == NULL)
+ if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- if (!S_ISBLK(inode->i_mode))
- goto out3;
-
- reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
- if (ret) {
+ reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
+ FMODE_WRITE | FMODE_READ, NULL);
+ if (IS_ERR(reg->hr_bdev)) {
+ ret = PTR_ERR(reg->hr_bdev);
reg->hr_bdev = NULL;
- goto out3;
+ goto out2;
}
- inode = NULL;
bdevname(reg->hr_bdev, reg->hr_dev_name);
@@ -1909,16 +1904,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
config_item_name(&reg->hr_item), reg->hr_dev_name);
out3:
- iput(inode);
+ if (ret < 0) {
+ blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE);
+ reg->hr_bdev = NULL;
+ }
out2:
fdput(f);
out:
- if (ret < 0) {
- if (reg->hr_bdev) {
- blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- reg->hr_bdev = NULL;
- }
- }
return ret;
}
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ef24574f481..cea739be77c4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -67,7 +67,7 @@ static void o2quo_fence_self(void)
default:
WARN_ON(o2nm_single_cluster->cl_fence_method >=
O2NM_FENCE_METHODS);
- /* fall through */
+ fallthrough;
case O2NM_FENCE_RESET:
printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
"system by restarting ***\n");
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 720e9f94957e..fc8252a28cb1 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* Under certain conditions, the window slide code
* might have reduced the number of bits available or
- * disabled the the local alloc entirely. Re-check
+ * disabled the local alloc entirely. Re-check
* here and return -ENOSPC if necessary.
*/
status = -ENOSPC;
diff --git a/fs/open.c b/fs/open.c
index c80e9f497e9b..9af548fb841b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -779,12 +779,6 @@ static int do_dentry_open(struct file *f,
return 0;
}
- /* Any file opened for execve()/uselib() has to be a regular file. */
- if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
- error = -EACCES;
- goto cleanup_file;
- }
-
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index eced272a3c57..a25e6c890975 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -122,6 +122,8 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct iattr iattr;
int rc;
+ memset(&iattr, 0, sizeof iattr);
+
if (type == ACL_TYPE_ACCESS && acl) {
/*
* posix_acl_update_mode checks to see if the permissions
@@ -138,18 +140,17 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
}
- if (acl) {
- rc = __orangefs_set_acl(inode, acl, type);
- } else {
+ if (inode->i_mode != iattr.ia_mode)
iattr.ia_valid = ATTR_MODE;
- rc = __orangefs_setattr(inode, &iattr);
- }
- return rc;
-
- } else {
- return -EINVAL;
}
+
+ rc = __orangefs_set_acl(inode, acl, type);
+
+ if (!rc && (iattr.ia_valid == ATTR_MODE))
+ rc = __orangefs_setattr(inode, &iattr);
+
+ return rc;
}
int orangefs_init_acl(struct inode *inode, struct inode *dir)
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 289b648ae196..74a3d6337ef4 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -149,7 +149,6 @@ static int __init orangefs_init(void)
pr_info("%s: module version %s loaded\n",
__func__,
ORANGEFS_VERSION);
- ret = 0;
goto out;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 60dbee457143..0ac197658a2d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
}
}
-/* Drop the inode semaphore and wait for a pipe event, atomically */
-void pipe_wait(struct pipe_inode_info *pipe)
-{
- DEFINE_WAIT(rdwait);
- DEFINE_WAIT(wrwait);
-
- /*
- * Pipes are system-local resources, so sleeping on them
- * is considered a noninteractive wait:
- */
- prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
- prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
- pipe_unlock(pipe);
- schedule();
- finish_wait(&pipe->rd_wait, &rdwait);
- finish_wait(&pipe->wr_wait, &wrwait);
- pipe_lock(pipe);
-}
-
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -913,19 +894,18 @@ int create_pipe_files(struct file **res, int flags)
{
struct inode *inode = get_pipe_inode();
struct file *f;
+ int error;
if (!inode)
return -ENFILE;
if (flags & O_NOTIFICATION_PIPE) {
-#ifdef CONFIG_WATCH_QUEUE
- if (watch_queue_init(inode->i_pipe) < 0) {
+ error = watch_queue_init(inode->i_pipe);
+ if (error) {
+ free_pipe_info(inode->i_pipe);
iput(inode);
- return -ENOMEM;
+ return error;
}
-#else
- return -ENOPKG;
-#endif
}
f = alloc_file_pseudo(inode, pipe_mnt, "",
@@ -1035,12 +1015,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
return do_pipe2(fildes, 0);
}
+/*
+ * This is the stupid "wait for pipe to be readable or writable"
+ * model.
+ *
+ * See pipe_read/write() for the proper kind of exclusive wait,
+ * but that requires that we wake up any other readers/writers
+ * if we then do not end up reading everything (ie the whole
+ * "wake_next_reader/writer" logic in pipe_read/write()).
+ */
+void pipe_wait_readable(struct pipe_inode_info *pipe)
+{
+ pipe_unlock(pipe);
+ wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
+ pipe_lock(pipe);
+}
+
+void pipe_wait_writable(struct pipe_inode_info *pipe)
+{
+ pipe_unlock(pipe);
+ wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
+ pipe_lock(pipe);
+}
+
+/*
+ * This depends on both the wait (here) and the wakeup (wake_up_partner)
+ * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
+ * race with the count check and waitqueue prep.
+ *
+ * Normally in order to avoid races, you'd do the prepare_to_wait() first,
+ * then check the condition you're waiting for, and only then sleep. But
+ * because of the pipe lock, we can check the condition before being on
+ * the wait queue.
+ *
+ * We use the 'rd_wait' waitqueue for pipe partner waiting.
+ */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
+ DEFINE_WAIT(rdwait);
int cur = *cnt;
while (cur == *cnt) {
- pipe_wait(pipe);
+ prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+ pipe_unlock(pipe);
+ schedule();
+ finish_wait(&pipe->rd_wait, &rdwait);
+ pipe_lock(pipe);
if (signal_pending(current))
break;
}
@@ -1050,7 +1070,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
static void wake_up_partner(struct pipe_inode_info *pipe)
{
wake_up_interruptible_all(&pipe->rd_wait);
- wake_up_interruptible_all(&pipe->wr_wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a333caeca291..aa69c35d904c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -551,8 +551,17 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
{
unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
+ long badness;
+
+ badness = oom_badness(task, totalpages);
+ /*
+ * Special case OOM_SCORE_ADJ_MIN for all others scale the
+ * badness value into [0, 2000] range which we have been
+ * exporting for a long time so userspace might depend on it.
+ */
+ if (badness != LONG_MIN)
+ points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
- points = oom_badness(task, totalpages) * 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
@@ -1046,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
- static DEFINE_MUTEX(oom_adj_mutex);
struct mm_struct *mm = NULL;
struct task_struct *task;
int err = 0;
@@ -1086,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (atomic_read(&p->mm->mm_users) > 1) {
+ if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
mm = p->mm;
mmgrab(mm);
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f909243d4a66..9f1077d94cde 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_64BIT
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
return u;
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dbda4499a859..846d43df3fdf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -520,16 +520,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
+ page = xa_load(&vma->vm_file->f_mapping->i_pages,
linear_page_index(vma, addr));
- if (!page)
- return;
-
if (xa_is_value(page))
mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
return;
}
@@ -653,6 +647,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
@@ -723,9 +721,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pte_hole = smaps_pte_hole,
};
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss)
+ struct mem_size_stats *mss, unsigned long start)
{
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
+
#ifdef CONFIG_SHMEM
/* In case of smaps_rollup, reset the value from previous vma */
mss->check_shmem_swap = false;
@@ -742,18 +752,20 @@ static void smap_gather_stats(struct vm_area_struct *vma,
*/
unsigned long shmem_swapped = shmem_swap_usage(vma);
- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
- walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
- return;
+ ops = &smaps_shmem_walk_ops;
}
}
#endif
/* mmap_lock is held in m_start */
- walk_page_vma(vma, &smaps_walk_ops, mss);
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}
#define SEQ_PUT_DEC(str, val) \
@@ -786,7 +798,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
- SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
@@ -805,7 +817,7 @@ static int show_smap(struct seq_file *m, void *v)
memset(&mss, 0, sizeof(mss));
- smap_gather_stats(vma, &mss);
+ smap_gather_stats(vma, &mss, 0);
show_map_vma(m, vma);
@@ -816,7 +828,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
+ seq_printf(m, "THPeligible: %d\n",
transparent_hugepage_enabled(vma));
if (arch_pkeys_enabled())
@@ -853,9 +865,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
hold_task_mempolicy(priv);
- for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
- smap_gather_stats(vma, &mss);
+ for (vma = priv->mm->mmap; vma;) {
+ smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA2.
+ * Iterate the loop like the original one.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * find_vma(mm, 16k - 1) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * find_vma(mm, 16k - 1) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = find_vma(mm, last_vma_end - 1);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 above */
+ if (vma->vm_start >= last_vma_end)
+ continue;
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end)
+ smap_gather_stats(vma, &mss, last_vma_end);
+ }
+ /* Case 2 above */
+ vma = vma->vm_next;
}
show_vma_header_prefix(m, priv->mm->mmap->vm_start,
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 819428dfa32f..3ce89216670c 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -1081,7 +1081,6 @@ next_zone:
readop = psz_ftrace_read;
break;
case PSTORE_TYPE_CONSOLE:
- fallthrough;
case PSTORE_TYPE_PMSG:
readop = psz_record_read;
break;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index d1ceb76adb71..b59cd172b5f9 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -70,8 +70,3 @@ config QFMT_V2
config QUOTACTL
bool
default n
-
-config QUOTACTL_COMPAT
- bool
- depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
- default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index f2b49d0f0287..9160639daffa 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -4,5 +4,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o kqid.o
-obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
deleted file mode 100644
index c30572857619..000000000000
--- a/fs/quota/compat.c
+++ /dev/null
@@ -1,120 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/syscalls.h>
-#include <linux/compat.h>
-#include <linux/quotaops.h>
-
-/*
- * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
- * and is necessary due to alignment problems.
- */
-struct compat_if_dqblk {
- compat_u64 dqb_bhardlimit;
- compat_u64 dqb_bsoftlimit;
- compat_u64 dqb_curspace;
- compat_u64 dqb_ihardlimit;
- compat_u64 dqb_isoftlimit;
- compat_u64 dqb_curinodes;
- compat_u64 dqb_btime;
- compat_u64 dqb_itime;
- compat_uint_t dqb_valid;
-};
-
-/* XFS structures */
-struct compat_fs_qfilestat {
- compat_u64 dqb_bhardlimit;
- compat_u64 qfs_nblks;
- compat_uint_t qfs_nextents;
-};
-
-struct compat_fs_quota_stat {
- __s8 qs_version;
- __u16 qs_flags;
- __s8 qs_pad;
- struct compat_fs_qfilestat qs_uquota;
- struct compat_fs_qfilestat qs_gquota;
- compat_uint_t qs_incoredqs;
- compat_int_t qs_btimelimit;
- compat_int_t qs_itimelimit;
- compat_int_t qs_rtbtimelimit;
- __u16 qs_bwarnlimit;
- __u16 qs_iwarnlimit;
-};
-
-COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd,
- const char __user *, special, qid_t, id,
- void __user *, addr)
-{
- unsigned int cmds;
- struct if_dqblk __user *dqblk;
- struct compat_if_dqblk __user *compat_dqblk;
- struct fs_quota_stat __user *fsqstat;
- struct compat_fs_quota_stat __user *compat_fsqstat;
- compat_uint_t data;
- u16 xdata;
- long ret;
-
- cmds = cmd >> SUBCMDSHIFT;
-
- switch (cmds) {
- case Q_GETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = kernel_quotactl(cmd, special, id, dqblk);
- if (ret)
- break;
- if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &dqblk->dqb_valid) ||
- put_user(data, &compat_dqblk->dqb_valid))
- ret = -EFAULT;
- break;
- case Q_SETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = -EFAULT;
- if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &compat_dqblk->dqb_valid) ||
- put_user(data, &dqblk->dqb_valid))
- break;
- ret = kernel_quotactl(cmd, special, id, dqblk);
- break;
- case Q_XGETQSTAT:
- fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
- compat_fsqstat = addr;
- ret = kernel_quotactl(cmd, special, id, fsqstat);
- if (ret)
- break;
- ret = -EFAULT;
- /* Copying qs_version, qs_flags, qs_pad */
- if (copy_in_user(compat_fsqstat, fsqstat,
- offsetof(struct compat_fs_quota_stat, qs_uquota)))
- break;
- /* Copying qs_uquota */
- if (copy_in_user(&compat_fsqstat->qs_uquota,
- &fsqstat->qs_uquota,
- sizeof(compat_fsqstat->qs_uquota)) ||
- get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
- break;
- /* Copying qs_gquota */
- if (copy_in_user(&compat_fsqstat->qs_gquota,
- &fsqstat->qs_gquota,
- sizeof(compat_fsqstat->qs_gquota)) ||
- get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
- break;
- /* Copying the rest */
- if (copy_in_user(&compat_fsqstat->qs_incoredqs,
- &fsqstat->qs_incoredqs,
- sizeof(struct compat_fs_quota_stat) -
- offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
- get_user(xdata, &fsqstat->qs_iwarnlimit) ||
- put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
- break;
- ret = 0;
- break;
- default:
- ret = kernel_quotactl(cmd, special, id, addr);
- }
- return ret;
-}
diff --git a/fs/quota/compat.h b/fs/quota/compat.h
new file mode 100644
index 000000000000..ef7d1e12d650
--- /dev/null
+++ b/fs/quota/compat.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+
+struct compat_if_dqblk {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 dqb_bsoftlimit;
+ compat_u64 dqb_curspace;
+ compat_u64 dqb_ihardlimit;
+ compat_u64 dqb_isoftlimit;
+ compat_u64 dqb_curinodes;
+ compat_u64 dqb_btime;
+ compat_u64 dqb_itime;
+ compat_uint_t dqb_valid;
+};
+
+struct compat_fs_qfilestat {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 qfs_nblks;
+ compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+ __s8 qs_version;
+ __u16 qs_flags;
+ __s8 qs_pad;
+ struct compat_fs_qfilestat qs_uquota;
+ struct compat_fs_qfilestat qs_gquota;
+ compat_uint_t qs_incoredqs;
+ compat_int_t qs_btimelimit;
+ compat_int_t qs_itimelimit;
+ compat_int_t qs_rtbtimelimit;
+ __u16 qs_bwarnlimit;
+ __u16 qs_iwarnlimit;
+};
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 5444d3c4d93f..6b37d58f1067 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -19,6 +19,7 @@
#include <linux/types.h>
#include <linux/writeback.h>
#include <linux/nospec.h>
+#include "compat.h"
static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
qid_t id)
@@ -38,7 +39,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) ||
(type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))))
break;
- /*FALLTHROUGH*/
+ fallthrough;
default:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -211,8 +212,18 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
if (ret)
return ret;
copy_to_if_dqblk(&idq, &fdq);
- if (copy_to_user(addr, &idq, sizeof(idq)))
- return -EFAULT;
+
+ if (compat_need_64bit_alignment_fixup()) {
+ struct compat_if_dqblk __user *compat_dqblk = addr;
+
+ if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk)))
+ return -EFAULT;
+ if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
+ }
return 0;
}
@@ -277,8 +288,16 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
struct if_dqblk idq;
struct kqid qid;
- if (copy_from_user(&idq, addr, sizeof(idq)))
- return -EFAULT;
+ if (compat_need_64bit_alignment_fixup()) {
+ struct compat_if_dqblk __user *compat_dqblk = addr;
+
+ if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) ||
+ get_user(idq.dqb_valid, &compat_dqblk->dqb_valid))
+ return -EFAULT;
+ } else {
+ if (copy_from_user(&idq, addr, sizeof(idq)))
+ return -EFAULT;
+ }
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
@@ -382,6 +401,33 @@ static int quota_getstate(struct super_block *sb, int type,
return 0;
}
+static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to,
+ struct fs_qfilestat *from)
+{
+ if (copy_to_user(to, from, sizeof(*to)) ||
+ put_user(from->qfs_nextents, &to->qfs_nextents))
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to,
+ struct fs_quota_stat *from)
+{
+ if (put_user(from->qs_version, &to->qs_version) ||
+ put_user(from->qs_flags, &to->qs_flags) ||
+ put_user(from->qs_pad, &to->qs_pad) ||
+ compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) ||
+ compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) ||
+ put_user(from->qs_incoredqs, &to->qs_incoredqs) ||
+ put_user(from->qs_btimelimit, &to->qs_btimelimit) ||
+ put_user(from->qs_itimelimit, &to->qs_itimelimit) ||
+ put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) ||
+ put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) ||
+ put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit))
+ return -EFAULT;
+ return 0;
+}
+
static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
{
struct fs_quota_stat fqs;
@@ -390,9 +436,14 @@ static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
if (!sb->s_qcop->get_state)
return -ENOSYS;
ret = quota_getstate(sb, type, &fqs);
- if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+ if (ret)
+ return ret;
+
+ if (compat_need_64bit_alignment_fixup())
+ return compat_copy_fs_quota_stat(addr, &fqs);
+ if (copy_to_user(addr, &fqs, sizeof(fqs)))
return -EFAULT;
- return ret;
+ return 0;
}
static int quota_getstatev(struct super_block *sb, int type,
@@ -816,8 +867,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
* calls. Maybe we need to add the process quotas etc. in the future,
* but we probably should use rlimits for that.
*/
-int kernel_quotactl(unsigned int cmd, const char __user *special,
- qid_t id, void __user *addr)
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+ qid_t, id, void __user *, addr)
{
uint cmds, type;
struct super_block *sb = NULL;
@@ -871,9 +922,3 @@ out:
path_put(pathp);
return ret;
}
-
-SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
- qid_t, id, void __user *, addr)
-{
- return kernel_quotactl(cmd, special, id, addr);
-}
diff --git a/fs/read_write.c b/fs/read_write.c
index 5db58b8c78d0..19f5c4bf75aa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -538,6 +538,14 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
+/*
+ * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
+ * but autofs is one of the few internal kernel users that actually
+ * wants this _and_ can be built as a module. So we need to export
+ * this symbol for autofs, even though it really isn't appropriate
+ * for any other kernel modules.
+ */
+EXPORT_SYMBOL_GPL(__kernel_write);
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
loff_t *pos)
@@ -752,185 +760,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return ret;
}
-/**
- * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
- * into the kernel and check that it is valid.
- *
- * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
- * @uvector: Pointer to the userspace array.
- * @nr_segs: Number of elements in userspace array.
- * @fast_segs: Number of elements in @fast_pointer.
- * @fast_pointer: Pointer to (usually small on-stack) kernel array.
- * @ret_pointer: (output parameter) Pointer to a variable that will point to
- * either @fast_pointer, a newly allocated kernel array, or NULL,
- * depending on which array was used.
- *
- * This function copies an array of &struct iovec of @nr_segs from
- * userspace into the kernel and checks that each element is valid (e.g.
- * it does not point to a kernel address or cause overflow by being too
- * large, etc.).
- *
- * As an optimization, the caller may provide a pointer to a small
- * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
- * (the size of this array, or 0 if unused, should be given in @fast_segs).
- *
- * @ret_pointer will always point to the array that was used, so the
- * caller must take care not to call kfree() on it e.g. in case the
- * @fast_pointer array was used and it was allocated on the stack.
- *
- * Return: The total number of bytes covered by the iovec array on success
- * or a negative error code on error.
- */
-ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- unsigned long nr_segs, unsigned long fast_segs,
- struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- unsigned long seg;
- ssize_t ret;
- struct iovec *iov = fast_pointer;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0) {
- ret = 0;
- goto out;
- }
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
- if (nr_segs > UIO_MAXIOV) {
- ret = -EINVAL;
- goto out;
- }
- if (nr_segs > fast_segs) {
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- }
- if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * According to the Single Unix Specification we should return EINVAL
- * if an element length is < 0 when cast to ssize_t or if the
- * total length would overflow the ssize_t return value of the
- * system call.
- *
- * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
- * overflow case.
- */
- ret = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- void __user *buf = iov[seg].iov_base;
- ssize_t len = (ssize_t)iov[seg].iov_len;
-
- /* see if we we're about to use an invalid len or if
- * it's about to overflow ssize_t */
- if (len < 0) {
- ret = -EINVAL;
- goto out;
- }
- if (type >= 0
- && unlikely(!access_ok(buf, len))) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - ret) {
- len = MAX_RW_COUNT - ret;
- iov[seg].iov_len = len;
- }
- ret += len;
- }
-out:
- *ret_pointer = iov;
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-ssize_t compat_rw_copy_check_uvector(int type,
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
- unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- compat_ssize_t tot_len;
- struct iovec *iov = *ret_pointer = fast_pointer;
- ssize_t ret = 0;
- int seg;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0)
- goto out;
-
- ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV)
- goto out;
- if (nr_segs > fast_segs) {
- ret = -ENOMEM;
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL)
- goto out;
- }
- *ret_pointer = iov;
-
- ret = -EFAULT;
- if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t.
- *
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
- * no overflow possibility.
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- compat_uptr_t buf;
- compat_ssize_t len;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
- goto out;
- if (type >= 0 &&
- !access_ok(compat_ptr(buf), len)) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - tot_len)
- len = MAX_RW_COUNT - tot_len;
- tot_len += len;
- iov->iov_base = compat_ptr(buf);
- iov->iov_len = (compat_size_t) len;
- uvector++;
- iov++;
- }
- ret = tot_len;
-
-out:
- return ret;
-}
-#endif
-
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
loff_t *pos, rwf_t flags)
{
@@ -1247,224 +1076,93 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
return do_pwritev(fd, vec, vlen, pos, flags);
}
+/*
+ * Various compat syscalls. Note that they all pretend to take a native
+ * iovec - import_iovec will properly treat those as compat_iovecs based on
+ * in_compat_syscall().
+ */
#ifdef CONFIG_COMPAT
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- ret = do_iter_read(file, &iter, pos, flags);
- kfree(iov);
- }
- if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
- return ret;
-}
-
-static size_t do_compat_readv(compat_ulong_t fd,
- const struct compat_iovec __user *vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-
-}
-
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen)
-{
- return do_compat_readv(fd, vec, vlen, 0);
-}
-
-static long do_compat_preadv64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
-}
-
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
-}
-
-static size_t compat_writev(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- file_start_write(file);
- ret = do_iter_write(file, &iter, pos, flags);
- file_end_write(file);
- kfree(iov);
- }
- if (ret > 0)
- add_wchar(current, ret);
- inc_syscw(current);
- return ret;
-}
-
-static size_t do_compat_writev(compat_ulong_t fd,
- const struct compat_iovec __user* vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- const struct compat_iovec __user *, vec,
- compat_ulong_t, vlen)
-{
- return do_compat_writev(fd, vec, vlen, 0);
-}
-
-static long do_compat_pwritev64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
-
-#endif
+#endif /* CONFIG_COMPAT */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index 6b2b4362089e..b57b3ffcbc32 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -217,10 +217,8 @@ int romfs_dev_read(struct super_block *sb, unsigned long pos,
size_t limit;
limit = romfs_maxsize(sb);
- if (pos >= limit)
+ if (pos >= limit || buflen > limit - pos)
return -EIO;
- if (buflen > limit - pos)
- buflen = limit - pos;
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd)
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4e6239f33c06..31219c1db17d 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -295,7 +295,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
- /* fall through */
+ fallthrough;
case SEEK_SET:
if (offset < 0)
break;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 44b6845b071c..456046e15873 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -176,7 +176,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
if (!nonblock)
break;
ret = -EAGAIN;
- /* fall through */
+ fallthrough;
default:
spin_unlock_irq(&current->sighand->siglock);
return ret;
@@ -314,9 +314,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
{
sigset_t mask;
- if (sizemask != sizeof(sigset_t) ||
- copy_from_user(&mask, user_mask, sizeof(mask)))
+ if (sizemask != sizeof(sigset_t))
return -EINVAL;
+ if (copy_from_user(&mask, user_mask, sizeof(mask)))
+ return -EFAULT;
return do_signalfd4(ufd, &mask, flags);
}
@@ -325,9 +326,10 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
{
sigset_t mask;
- if (sizemask != sizeof(sigset_t) ||
- copy_from_user(&mask, user_mask, sizeof(mask)))
+ if (sizemask != sizeof(sigset_t))
return -EINVAL;
+ if (copy_from_user(&mask, user_mask, sizeof(mask)))
+ return -EFAULT;
return do_signalfd4(ufd, &mask, 0);
}
diff --git a/fs/splice.c b/fs/splice.c
index d7c8a7c4db07..70cc52af780b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,7 +33,6 @@
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
-#include <linux/compat.h>
#include <linux/sched/signal.h>
#include "internal.h"
@@ -526,6 +525,22 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
return 1;
}
+/* We know we have a pipe buffer, but maybe it's empty? */
+static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
+{
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+
+ if (unlikely(!buf->len)) {
+ pipe_buf_release(pipe, buf);
+ pipe->tail = tail+1;
+ return true;
+ }
+
+ return false;
+}
+
/**
* splice_from_pipe_next - wait for some data to splice from
* @pipe: pipe to splice from
@@ -545,6 +560,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
if (signal_pending(current))
return -ERESTARTSYS;
+repeat:
while (pipe_empty(pipe->head, pipe->tail)) {
if (!pipe->writers)
return 0;
@@ -563,9 +579,12 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
sd->need_wakeup = false;
}
- pipe_wait(pipe);
+ pipe_wait_readable(pipe);
}
+ if (eat_empty_buffer(pipe))
+ goto repeat;
+
return 1;
}
@@ -1077,7 +1096,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
return -EAGAIN;
if (signal_pending(current))
return -ERESTARTSYS;
- pipe_wait(pipe);
+ pipe_wait_writable(pipe);
}
}
@@ -1332,20 +1351,6 @@ static int vmsplice_type(struct fd f, int *type)
* Currently we punt and implement it as a normal copy, see pipe_to_user().
*
*/
-static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
-{
- if (unlikely(flags & ~SPLICE_F_ALL))
- return -EINVAL;
-
- if (!iov_iter_count(iter))
- return 0;
-
- if (iov_iter_rw(iter) == WRITE)
- return vmsplice_to_pipe(f, iter, flags);
- else
- return vmsplice_to_user(f, iter, flags);
-}
-
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
unsigned long, nr_segs, unsigned int, flags)
{
@@ -1356,6 +1361,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
struct fd f;
int type;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
f = fdget(fd);
error = vmsplice_type(f, &type);
if (error)
@@ -1363,40 +1371,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
error = import_iovec(type, uiov, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
- if (error >= 0) {
- error = do_vmsplice(f.file, &iter, flags);
- kfree(iov);
- }
- fdput(f);
- return error;
-}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
- unsigned int, nr_segs, unsigned int, flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t error;
- struct fd f;
- int type;
+ if (error < 0)
+ goto out_fdput;
- f = fdget(fd);
- error = vmsplice_type(f, &type);
- if (error)
- return error;
+ if (!iov_iter_count(&iter))
+ error = 0;
+ else if (iov_iter_rw(&iter) == WRITE)
+ error = vmsplice_to_pipe(f.file, &iter, flags);
+ else
+ error = vmsplice_to_user(f.file, &iter, flags);
- error = compat_import_iovec(type, iov32, nr_segs,
- ARRAY_SIZE(iovstack), &iov, &iter);
- if (error >= 0) {
- error = do_vmsplice(f.file, &iter, flags);
- kfree(iov);
- }
+ kfree(iov);
+out_fdput:
fdput(f);
return error;
}
-#endif
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
@@ -1454,7 +1443,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
ret = -EAGAIN;
break;
}
- pipe_wait(pipe);
+ pipe_wait_readable(pipe);
}
pipe_unlock(pipe);
@@ -1493,7 +1482,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
ret = -ERESTARTSYS;
break;
}
- pipe_wait(pipe);
+ pipe_wait_writable(pipe);
}
pipe_unlock(pipe);
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 76bb1c846845..8a19773b5a0b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,7 +87,11 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, page_count);
+ if (page_count <= BIO_MAX_PAGES)
+ bio = bio_alloc(GFP_NOIO, page_count);
+ else
+ bio = bio_kmalloc(GFP_NOIO, page_count);
+
if (!bio)
return -ENOMEM;
diff --git a/fs/super.c b/fs/super.c
index 904459b35119..a51c2083cd6b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1256,6 +1256,8 @@ static int set_bdev_super(struct super_block *s, void *data)
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+ if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
+ s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d042942d8b2..155521e51ac5 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -81,19 +81,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
struct ubifs_inode *ui;
bool encrypted = false;
- if (IS_ENCRYPTED(dir)) {
- err = fscrypt_get_encryption_info(dir);
- if (err) {
- ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err);
- return ERR_PTR(err);
- }
-
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-EPERM);
-
- encrypted = true;
- }
-
inode = new_inode(c->vfs_sb);
ui = ubifs_inode(inode);
if (!inode)
@@ -112,6 +99,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
current_time(inode);
inode->i_mapping->nrpages = 0;
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (err) {
+ ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
+ goto out_iput;
+ }
+
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -131,7 +124,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
case S_IFBLK:
case S_IFCHR:
inode->i_op = &ubifs_file_inode_operations;
- encrypted = false;
break;
default:
BUG();
@@ -151,9 +143,8 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
if (c->highest_inum >= INUM_WATERMARK) {
spin_unlock(&c->cnt_lock);
ubifs_err(c, "out of inode numbers");
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(-EINVAL);
+ err = -EINVAL;
+ goto out_iput;
}
ubifs_warn(c, "running out of inode numbers (current %lu, max %u)",
(unsigned long)c->highest_inum, INUM_WATERMARK);
@@ -171,16 +162,19 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
spin_unlock(&c->cnt_lock);
if (encrypted) {
- err = fscrypt_inherit_context(dir, inode, &encrypted, true);
+ err = fscrypt_set_context(inode, NULL);
if (err) {
- ubifs_err(c, "fscrypt_inherit_context failed: %i", err);
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(err);
+ ubifs_err(c, "fscrypt_set_context failed: %i", err);
+ goto out_iput;
}
}
return inode;
+
+out_iput:
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
static int dbg_check_name(const struct ubifs_info *c,
@@ -515,7 +509,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
if (err)
return err;
- err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr);
if (err)
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 2e6264318bd9..4a5b06f8d812 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -539,7 +539,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent)
{
- int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
+ int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0;
int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
int last_reference = !!(deletion && inode->i_nlink == 0);
struct ubifs_inode *ui = ubifs_inode(inode);
@@ -630,6 +630,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
goto out_finish;
}
ui->del_cmtno = c->cmt_no;
+ orphan_added = 1;
}
err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
@@ -702,7 +703,7 @@ out_release:
kfree(dent);
out_ro:
ubifs_ro_mode(c, err);
- if (last_reference)
+ if (orphan_added)
ubifs_delete_orphan(c, inode->i_ino);
finish_reservation(c);
return err;
@@ -1218,7 +1219,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
void *p;
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
- int err, dlen1, dlen2, ilen, lnum, offs, len;
+ int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0;
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
@@ -1334,6 +1335,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
goto out_finish;
}
new_ui->del_cmtno = c->cmt_no;
+ orphan_added = 1;
}
err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
@@ -1415,7 +1417,7 @@ out_release:
release_head(c, BASEHD);
out_ro:
ubifs_ro_mode(c, err);
- if (last_reference)
+ if (orphan_added)
ubifs_delete_orphan(c, new_inode->i_ino);
out_finish:
finish_reservation(c);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 22bfda158f7f..6d6cd85c2b4c 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -269,7 +269,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
break;
/* No more room on heap so make it un-categorized */
cat = LPROPS_UNCAT;
- /* Fall through */
+ fallthrough;
case LPROPS_UNCAT:
list_add(&lprops->list, &c->uncat_list);
break;
@@ -313,7 +313,7 @@ static void ubifs_remove_from_cat(struct ubifs_info *c,
case LPROPS_FREEABLE:
c->freeable_cnt -= 1;
ubifs_assert(c, c->freeable_cnt >= 0);
- /* Fall through */
+ fallthrough;
case LPROPS_UNCAT:
case LPROPS_EMPTY:
case LPROPS_FRDI_IDX:
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c97a4d537d83..615878e884ba 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -121,7 +121,7 @@ static inline const char *ubifs_compr_name(struct ubifs_info *c, int compr_type)
* ubifs_wbuf_sync - synchronize write-buffer.
* @wbuf: write-buffer to synchronize
*
- * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume
+ * This is the same as 'ubifs_wbuf_sync_nolock()' but it does not assume
* that the write-buffer is already locked.
*/
static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 4b4b65b48c57..c0d3e4008d23 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -174,7 +174,8 @@ static int create_default_filesystem(struct ubifs_info *c)
tmp64 = (long long)max_buds * c->leb_size;
if (big_lpt)
sup_flags |= UBIFS_FLG_BIGLPT;
- sup_flags |= UBIFS_FLG_DOUBLE_HASH;
+ if (ubifs_default_version > 4)
+ sup_flags |= UBIFS_FLG_DOUBLE_HASH;
if (ubifs_authenticated(c)) {
sup_flags |= UBIFS_FLG_AUTHENTICATION;
@@ -200,7 +201,7 @@ static int create_default_filesystem(struct ubifs_info *c)
sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT);
sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
- sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
+ sup->fmt_version = cpu_to_le32(ubifs_default_version);
sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
if (c->mount_opts.override_compr)
sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7fc2f3f07c16..fbddb2a1c03f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -26,6 +26,24 @@
#include <linux/writeback.h>
#include "ubifs.h"
+static int ubifs_default_version_set(const char *val, const struct kernel_param *kp)
+{
+ int n = 0, ret;
+
+ ret = kstrtoint(val, 10, &n);
+ if (ret != 0 || n < 4 || n > UBIFS_FORMAT_VERSION)
+ return -EINVAL;
+ return param_set_int(val, kp);
+}
+
+static const struct kernel_param_ops ubifs_default_version_ops = {
+ .set = ubifs_default_version_set,
+ .get = param_get_int,
+};
+
+int ubifs_default_version = UBIFS_FORMAT_VERSION;
+module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_version, 0600);
+
/*
* Maximum amount of memory we may 'kmalloc()' without worrying that we are
* allocating too much.
@@ -2159,6 +2177,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
c->vi.vol_id);
if (err)
goto out_close;
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bff682309fbe..4ffd832e3b93 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1504,6 +1504,7 @@ extern const struct file_operations ubifs_dir_operations;
extern const struct inode_operations ubifs_dir_inode_operations;
extern const struct inode_operations ubifs_symlink_inode_operations;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+extern int ubifs_default_version;
/* auth.c */
static inline int ubifs_authenticated(const struct ubifs_info *c)
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6023c97c6da2..25ff91c7e94a 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -52,7 +52,7 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
elen += pc->lengthComponentIdent;
break;
}
- /* Fall through */
+ fallthrough;
case 2:
if (tolen == 0)
return -ENAMETOOLONG;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 1da0be667409..e3b69fb280e8 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -101,7 +101,7 @@ static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gene
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
struct inode *inode;
- if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
+ if (ino < UFS_ROOTINO || ino > (u64)uspi->s_ncg * uspi->s_ipg)
return ERR_PTR(-ESTALE);
inode = ufs_iget(sb, ino);
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index e1f1b2e868a7..4931bec1a01c 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -42,7 +42,7 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
case UFS_ST_SUNOS:
if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT)
return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state);
- /* Fall Through - to UFS_ST_SUN */
+ fallthrough; /* to UFS_ST_SUN */
case UFS_ST_SUN:
return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
case UFS_ST_SUNx86:
@@ -63,7 +63,7 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value);
break;
}
- /* Fall Through - to UFS_ST_SUN */
+ fallthrough; /* to UFS_ST_SUN */
case UFS_ST_SUN:
usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
break;
@@ -197,7 +197,7 @@ ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode)
case UFS_UID_EFT:
if (inode->ui_u1.oldids.ui_suid == 0xFFFF)
return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid);
- /* Fall through */
+ fallthrough;
default:
return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_suid);
}
@@ -215,7 +215,7 @@ ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value)
inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value);
if (value > 0xFFFF)
value = 0xFFFF;
- /* Fall through */
+ fallthrough;
default:
inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value);
break;
@@ -231,7 +231,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode)
case UFS_UID_EFT:
if (inode->ui_u1.oldids.ui_sgid == 0xFFFF)
return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid);
- /* Fall through */
+ fallthrough;
default:
return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid);
}
@@ -249,7 +249,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value);
if (value > 0xFFFF)
value = 0xFFFF;
- /* Fall through */
+ fallthrough;
default:
inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value);
break;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 6e264dded46e..0e4a3837da52 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -61,7 +61,7 @@ struct userfaultfd_ctx {
/* waitqueue head for events */
wait_queue_head_t event_wqh;
/* a refile sequence protected by fault_pending_wqh lock */
- struct seqcount refile_seq;
+ seqcount_spinlock_t refile_seq;
/* pseudo fd refcounting */
refcount_t refcount;
/* userfaultfd syscall flags */
@@ -1961,7 +1961,7 @@ static void init_once_userfaultfd_ctx(void *mem)
init_waitqueue_head(&ctx->fault_wqh);
init_waitqueue_head(&ctx->event_wqh);
init_waitqueue_head(&ctx->fd_wqh);
- seqcount_init(&ctx->refile_seq);
+ seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}
SYSCALL_DEFINE1(userfaultfd, int, flags)
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 8fe03b4a0d2b..d7816c01a4f6 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -167,6 +167,8 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id);
if (err)
goto fail_free;
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
/* Turn source into a shfl_string and map the folder */
size = strlen(fc->source) + 1;
@@ -384,7 +386,7 @@ fail_nomem:
static int vboxsf_parse_monolithic(struct fs_context *fc, void *data)
{
- char *options = data;
+ unsigned char *options = data;
if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 &&
options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 &&
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 96bd160da48b..018057546067 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -226,7 +226,7 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat,
break;
case AT_STATX_FORCE_SYNC:
sf_i->force_restat = 1;
- /* fall-through */
+ fallthrough;
default:
err = vboxsf_inode_revalidate(dentry);
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 386b45676d7e..cd7a563e8bcd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
}
/**
- * __vfs_setxattr_locked: set an extended attribute while holding the inode
+ * __vfs_setxattr_locked - set an extended attribute while holding the inode
* lock
*
- * @dentry - object to perform setxattr on
- * @name - xattr name to set
- * @value - value to set @name to
- * @size - size of @value
- * @flags - flags to pass into filesystem operations
- * @delegated_inode - on return, will contain an inode pointer that
+ * @dentry: object to perform setxattr on
+ * @name: xattr name to set
+ * @value: value to set @name to
+ * @size: size of @value
+ * @flags: flags to pass into filesystem operations
+ * @delegated_inode: on return, will contain an inode pointer that
* a delegation was broken on, NULL if none.
*/
int
@@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
EXPORT_SYMBOL(__vfs_removexattr);
/**
- * __vfs_removexattr_locked: set an extended attribute while holding the inode
+ * __vfs_removexattr_locked - set an extended attribute while holding the inode
* lock
*
- * @dentry - object to perform setxattr on
- * @name - name of xattr to remove
- * @delegated_inode - on return, will contain an inode pointer that
+ * @dentry: object to perform setxattr on
+ * @name: name of xattr to remove
+ * @delegated_inode: on return, will contain an inode pointer that
* a delegation was broken on, NULL if none.
*/
int
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index e841ed781a25..e986b95d94c9 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -93,25 +93,3 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
return ptr;
return __kmem_vmalloc(size, flags);
}
-
-void *
-kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_realloc(newsize, flags, _RET_IP_);
-
- do {
- ptr = krealloc(old, newsize, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
- current->comm, current->pid,
- newsize, __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8e8555817e6d..38007117697e 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -59,7 +59,6 @@ kmem_flags_convert(xfs_km_flags_t flags)
extern void *kmem_alloc(size_t, xfs_km_flags_t);
extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
@@ -72,12 +71,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
return kmem_alloc(size, flags | KM_ZERO);
}
-static inline void *
-kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc_large(size, flags | KM_ZERO);
-}
-
/*
* Zone interfaces
*/
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 8cf73fe4338e..9331f3516afa 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -333,6 +333,11 @@ xfs_agiblock_init(
}
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ agi->agi_iblocks = cpu_to_be32(1);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ agi->agi_fblocks = cpu_to_be32(1);
+ }
}
typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 2e055c079f39..fd8e6418a0d3 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -428,7 +428,7 @@ xfs_attr_set(
*/
if (XFS_IFORK_Q(dp) == 0) {
int sf_size = sizeof(struct xfs_attr_sf_hdr) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen,
+ xfs_attr_sf_entsize_byname(args->namelen,
args->valuelen);
error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
@@ -523,6 +523,14 @@ out_trans_cancel:
* External routines when attribute list is inside the inode
*========================================================================*/
+static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+{
+ struct xfs_attr_shortform *sf;
+
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ return be16_to_cpu(sf->hdr.totsize);
+}
+
/*
* Add a name to the shortform attribute list structure
* This is the external routine.
@@ -555,8 +563,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return -ENOSPC;
- newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
- newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ newsize = xfs_attr_sf_totsize(args->dp);
+ newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
if (!forkoff)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 8623c815164a..bb128db220ac 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -653,8 +653,8 @@ xfs_attr_shortform_create(
ASSERT(ifp->if_flags & XFS_IFINLINE);
}
xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
- hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
- hdr->count = 0;
+ hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+ memset(hdr, 0, sizeof(*hdr));
hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -684,9 +684,9 @@ xfs_attr_sf_findname(
sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
end = sf->hdr.count;
- for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+ for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
base += size, i++) {
- size = XFS_ATTR_SF_ENTSIZE(sfe);
+ size = xfs_attr_sf_entsize(sfe);
if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
sfe->flags))
continue;
@@ -728,15 +728,15 @@ xfs_attr_shortform_add(
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
ASSERT(0);
offset = (char *)sfe - (char *)sf;
- size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
@@ -787,12 +787,12 @@ xfs_attr_shortform_remove(
dp = args->dp;
mp = dp->i_mount;
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
if (error != -EEXIST)
return error;
- size = XFS_ATTR_SF_ENTSIZE(sfe);
+ size = xfs_attr_sf_entsize(sfe);
/*
* Fix up the attribute fork data, covering the hole
@@ -837,8 +837,8 @@ xfs_attr_shortform_remove(
int
xfs_attr_shortform_lookup(xfs_da_args_t *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
int i;
struct xfs_ifork *ifp;
@@ -846,10 +846,10 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
ifp = args->dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
sfe->flags))
return -EEXIST;
@@ -873,10 +873,10 @@ xfs_attr_shortform_getvalue(
int i;
ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
sfe->flags))
return xfs_attr_copy_value(args,
@@ -908,12 +908,12 @@ xfs_attr_shortform_to_leaf(
dp = args->dp;
ifp = dp->i_afp;
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (xfs_attr_shortform_t *)tmpbuffer;
+ sf = (struct xfs_attr_shortform *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -951,7 +951,7 @@ xfs_attr_shortform_to_leaf(
ASSERT(error != -ENOSPC);
if (error)
goto out;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
*leaf_bp = bp;
@@ -992,9 +992,8 @@ xfs_attr_shortform_allfit(
return 0;
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return 0;
- bytes += sizeof(struct xfs_attr_sf_entry) - 1
- + name_loc->namelen
- + be16_to_cpu(name_loc->valuelen);
+ bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
@@ -1036,6 +1035,8 @@ xfs_attr_shortform_verify(
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
* within the data buffer.
+ * xfs_attr_sf_entry is defined with a 1-byte variable
+ * array at the end, so we must subtract that off.
*/
if (((char *)sfep + sizeof(*sfep)) >= endp)
return __this_address;
@@ -1049,7 +1050,7 @@ xfs_attr_shortform_verify(
* within the data buffer. The next entry starts after the
* name component, so nextentry is an acceptable test.
*/
- next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep);
+ next_sfep = xfs_attr_sf_nextentry(sfep);
if ((char *)next_sfep > endp)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index bb004fb7944a..37578b369d9b 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -13,7 +13,6 @@
* to fit into the literal area of the inode.
*/
typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
/*
* We generate this then sort it, attr_list() must return things in hash-order.
@@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort {
unsigned char *name; /* name value, pointer into buffer */
} xfs_attr_sf_sort_t;
-#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
- (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
-#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
- ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
-#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
- ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
-#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
- (be16_to_cpu(((xfs_attr_shortform_t *) \
- ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+/* space name/value uses */
+static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen)
+{
+ return sizeof(struct xfs_attr_sf_entry) + nlen + vlen;
+}
+
+/* space an entry uses */
+static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
+{
+ return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
+}
+
+/* next entry in struct */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_attr_sf_entsize(sfep);
+}
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9c40d5971035..1b0a01b06a05 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6226,7 +6226,7 @@ xfs_bmap_validate_extent(
isrt = XFS_IS_REALTIME_INODE(ip);
endfsb = irec->br_startblock + irec->br_blockcount - 1;
- if (isrt) {
+ if (isrt && whichfork == XFS_DATA_FORK) {
if (!xfs_verify_rtbno(mp, irec->br_startblock))
return __this_address;
if (!xfs_verify_rtbno(mp, endfsb))
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 059ac108b1b3..b40a4e80f5ee 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -579,7 +579,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
/*
* Entries are packed toward the top as tight as possible.
*/
-typedef struct xfs_attr_shortform {
+struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
__be16 totsize; /* total bytes in shortform list */
__u8 count; /* count of active entries */
@@ -589,9 +589,9 @@ typedef struct xfs_attr_shortform {
uint8_t namelen; /* actual length of name (no NULL) */
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[1]; /* name & value bytes concatenated */
+ uint8_t nameval[]; /* name & value bytes concatenated */
} list[1]; /* variable sized array */
-} xfs_attr_shortform_t;
+};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 5a2db00b9d5f..6766417d5ba4 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -69,6 +69,13 @@ xfs_dquot_verify(
ddq_type != XFS_DQTYPE_GROUP)
return __this_address;
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
+ return __this_address;
+
if (id != -1 && id != be32_to_cpu(ddq->d_id))
return __this_address;
@@ -288,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+
+/* Convert an on-disk timer value into an incore timer value. */
+time64_t
+xfs_dquot_from_disk_ts(
+ struct xfs_disk_dquot *ddq,
+ __be32 dtimer)
+{
+ uint32_t t = be32_to_cpu(dtimer);
+
+ if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME))
+ return xfs_dq_bigtime_to_unix(t);
+
+ return t;
+}
+
+/* Convert an incore timer value into an on-disk timer value. */
+__be32
+xfs_dquot_to_disk_ts(
+ struct xfs_dquot *dqp,
+ time64_t timer)
+{
+ uint32_t t = timer;
+
+ if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME))
+ t = xfs_dq_unix_to_bigtime(timer);
+
+ return cpu_to_be32(t);
+}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 31b7ece985bb..dd764da08f6f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -449,10 +449,12 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
- XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ XFS_SB_FEAT_RO_COMPAT_REFLINK| \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -465,10 +467,12 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
- XFS_SB_FEAT_INCOMPAT_META_UUID)
+ XFS_SB_FEAT_INCOMPAT_META_UUID| \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -563,6 +567,23 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
}
+static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME);
+}
+
+/*
+ * Inode btree block counter. We record the number of inobt and finobt blocks
+ * in the AGI header so that we can skip the finobt walk at mount time when
+ * setting up per-AG reservations.
+ */
+static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT);
+}
+
/*
* end of superblock version macros
*/
@@ -765,6 +786,9 @@ typedef struct xfs_agi {
__be32 agi_free_root; /* root of the free inode btree */
__be32 agi_free_level;/* levels in free inode btree */
+ __be32 agi_iblocks; /* inobt blocks used */
+ __be32 agi_fblocks; /* finobt blocks used */
+
/* structure must be padded to 64 bit alignment */
} xfs_agi_t;
@@ -785,7 +809,8 @@ typedef struct xfs_agi {
#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
#define XFS_AGI_FREE_ROOT (1 << 11)
#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_NUM_BITS_R2 13
+#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
@@ -831,10 +856,87 @@ struct xfs_agfl {
ASSERT(xfs_daddr_to_agno(mp, d) == \
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
-typedef struct xfs_timestamp {
+/*
+ * XFS Timestamps
+ * ==============
+ *
+ * Traditional ondisk inode timestamps consist of signed 32-bit counters for
+ * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC
+ * 1970, which means that the timestamp epoch is the same as the Unix epoch.
+ * Therefore, the ondisk min and max defined here can be used directly to
+ * constrain the incore timestamps on a Unix system. Note that we actually
+ * encode a __be64 value on disk.
+ *
+ * When the bigtime feature is enabled, ondisk inode timestamps become an
+ * unsigned 64-bit nanoseconds counter. This means that the bigtime inode
+ * timestamp epoch is the start of the classic timestamp range, which is
+ * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * /must/ use the bigtime conversion functions when encoding and decoding raw
+ * timestamps.
+ */
+typedef __be64 xfs_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_timestamp {
__be32 t_sec; /* timestamp seconds */
__be32 t_nsec; /* timestamp nanoseconds */
-} xfs_timestamp_t;
+};
+
+/*
+ * Smallest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN)
+
+/*
+ * Largest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038.
+ */
+#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX)
+
+/*
+ * Smallest possible ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with the traditional
+ * minimum timestamp of Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_BIGTIME_TIME_MIN ((int64_t)0)
+
+/*
+ * Largest supported ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with an incore timestamp
+ * of Jul 2 20:20:24 UTC 2486.
+ *
+ * We round down the ondisk limit so that the bigtime quota and inode max
+ * timestamps will be the same.
+ */
+#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL))
+
+/*
+ * Bigtime epoch is set exactly to the minimum time value that a traditional
+ * 32-bit timestamp can represent when using the Unix epoch as a reference.
+ * Hence the Unix epoch is at a fixed offset into the supported bigtime
+ * timestamp range.
+ *
+ * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS
+ * timestamp can represent so we will not lose any fidelity in converting
+ * to/from unix and bigtime timestamps.
+ *
+ * The following conversion factor converts a seconds counter from the Unix
+ * epoch to the bigtime epoch.
+ */
+#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN)
+
+/* Convert a timestamp from the Unix epoch to the bigtime epoch. */
+static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds)
+{
+ return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/* Convert a timestamp from the bigtime epoch to the Unix epoch. */
+static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
+}
/*
* On-disk inode structure.
@@ -1061,12 +1163,22 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
+#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_ANY \
- (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
+ XFS_DIFLAG2_BIGTIME)
+
+static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
+}
/*
* Inode number format:
@@ -1152,13 +1264,98 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DQTYPE_USER 0x01 /* user dquot record */
#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
+#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
/* bitmask to determine if this is a user/group/project dquot */
#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
XFS_DQTYPE_PROJ | \
XFS_DQTYPE_GROUP)
-#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK)
+#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \
+ XFS_DQTYPE_BIGTIME)
+
+/*
+ * XFS Quota Timers
+ * ================
+ *
+ * Traditional quota grace period expiration timers are an unsigned 32-bit
+ * seconds counter; time zero is the Unix epoch, Jan 1 00:00:01 UTC 1970.
+ * Note that an expiration value of zero means that the quota limit has not
+ * been reached, and therefore no expiration has been set. Therefore, the
+ * ondisk min and max defined here can be used directly to constrain the incore
+ * quota expiration timestamps on a Unix system.
+ *
+ * When bigtime is enabled, we trade two bits of precision to expand the
+ * expiration timeout range to match that of big inode timestamps. The min and
+ * max recorded here are the on-disk limits, not a Unix timestamp.
+ *
+ * The grace period for each quota type is stored in the root dquot (id = 0)
+ * and is applied to a non-root dquot when it exceeds the soft or hard limits.
+ * The length of quota grace periods are unsigned 32-bit quantities measured in
+ * units of seconds. A value of zero means to use the default period.
+ */
+
+/*
+ * Smallest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1)
+
+/*
+ * Largest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX)
+
+/*
+ * Smallest possible ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with the incore
+ * expiration of Jan 1 00:00:04 UTC 1970.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN)
+
+/*
+ * Largest supported ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with an incore
+ * expiration of Jul 2 20:20:24 UTC 2486.
+ *
+ * The ondisk field supports values up to -1U, which corresponds to an incore
+ * expiration in 2514. This is beyond the maximum the bigtime inode timestamp,
+ * so we cap the maximum bigtime quota expiration to the max inode timestamp.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U)
+
+/*
+ * The following conversion factors assist in converting a quota expiration
+ * timestamp between the incore and ondisk formats.
+ */
+#define XFS_DQ_BIGTIME_SHIFT (2)
+#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1)
+
+/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */
+static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds)
+{
+ /*
+ * Round the expiration timestamp up to the nearest bigtime timestamp
+ * that we can store, to give users the most time to fix problems.
+ */
+ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >>
+ XFS_DQ_BIGTIME_SHIFT;
+}
+
+/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */
+static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT;
+}
+
+/*
+ * Default quota grace periods, ranging from zero (use the compiled defaults)
+ * to ~136 years. These are applied to a non-root dquot that has exceeded
+ * either limit.
+ */
+#define XFS_DQ_GRACE_MIN ((int64_t)0)
+#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
/*
* This is the main portion of the on-disk representation of quota information
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 84bcffa87753..2a2e3cfd94f0 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -249,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f742a96a2fe1..974e71bc4a3a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -688,7 +688,7 @@ xfs_ialloc_ag_alloc(
args.minalignslop = igeo->cluster_align - 1;
/* Allow space for the inode btree to split. */
- args.minleft = igeo->inobt_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -736,7 +736,7 @@ xfs_ialloc_ag_alloc(
/*
* Allow space for the inode btree to split.
*/
- args.minleft = igeo->inobt_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -2473,6 +2473,7 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_unlinked),
offsetof(xfs_agi_t, agi_free_root),
offsetof(xfs_agi_t, agi_free_level),
+ offsetof(xfs_agi_t, agi_iblocks),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
@@ -2806,6 +2807,10 @@ xfs_ialloc_setup_geometry(
uint64_t icount;
uint inodes;
+ igeo->new_diflags2 = 0;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 3c8aebc36e64..cc919a2ee870 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -67,6 +67,25 @@ xfs_finobt_set_root(
XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
}
+/* Update the inode btree block counter for this btree. */
+static inline void
+xfs_inobt_mod_blockcount(
+ struct xfs_btree_cur *cur,
+ int howmuch)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb))
+ return;
+
+ if (cur->bc_btnum == XFS_BTNUM_FINO)
+ be32_add_cpu(&agi->agi_fblocks, howmuch);
+ else if (cur->bc_btnum == XFS_BTNUM_INO)
+ be32_add_cpu(&agi->agi_iblocks, howmuch);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
+}
+
STATIC int
__xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
@@ -102,6 +121,7 @@ __xfs_inobt_alloc_block(
new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
+ xfs_inobt_mod_blockcount(cur, 1);
return 0;
}
@@ -134,6 +154,7 @@ __xfs_inobt_free_block(
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
+ xfs_inobt_mod_blockcount(cur, -1);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
&XFS_RMAP_OINFO_INOBT, resv);
@@ -480,19 +501,29 @@ xfs_inobt_commit_staged_btree(
{
struct xfs_agi *agi = agbp->b_addr;
struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+ int fields;
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
if (cur->bc_btnum == XFS_BTNUM_INO) {
+ fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
agi->agi_root = cpu_to_be32(afake->af_root);
agi->agi_level = cpu_to_be32(afake->af_levels);
- xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+ if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
} else {
+ fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
agi->agi_free_root = cpu_to_be32(afake->af_root);
agi->agi_free_level = cpu_to_be32(afake->af_levels);
- xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT |
- XFS_AGI_FREE_LEVEL);
+ if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
}
}
@@ -673,6 +704,28 @@ xfs_inobt_count_blocks(
return error;
}
+/* Read finobt block count from AGI header. */
+static int
+xfs_finobt_read_blocks(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agi *agi;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error)
+ return error;
+
+ agi = agbp->b_addr;
+ *tree_blocks = be32_to_cpu(agi->agi_fblocks);
+ xfs_trans_brelse(tp, agbp);
+ return 0;
+}
+
/*
* Figure out how many blocks to reserve and how many are used by this btree.
*/
@@ -690,7 +743,11 @@ xfs_finobt_calc_reserves(
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
return 0;
- error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len);
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len);
+ else
+ error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO,
+ &tree_len);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 52451809c478..b4164256993d 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -603,7 +603,7 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
+ new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
ifp->if_u1.if_root = new;
cur->leaf = new;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8d5dd08eab75..c667c63f2cb0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -157,6 +157,36 @@ xfs_imap_to_bp(
return 0;
}
+static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
+{
+ struct timespec64 tv;
+ uint32_t n;
+
+ tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n));
+ tv.tv_nsec = n;
+
+ return tv;
+}
+
+/* Convert an ondisk timestamp to an incore timestamp. */
+struct timespec64
+xfs_inode_from_disk_ts(
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+ struct xfs_legacy_timestamp *lts;
+
+ if (xfs_dinode_has_bigtime(dip))
+ return xfs_inode_decode_bigtime(be64_to_cpu(ts));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ tv.tv_sec = (int)be32_to_cpu(lts->t_sec);
+ tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec);
+
+ return tv;
+}
+
int
xfs_inode_from_disk(
struct xfs_inode *ip,
@@ -211,12 +241,9 @@ xfs_inode_from_disk(
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
- inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
- inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
- inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
- inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
- inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
+ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
+ inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
to->di_size = be64_to_cpu(from->di_size);
to->di_nblocks = be64_to_cpu(from->di_nblocks);
@@ -229,8 +256,7 @@ xfs_inode_from_disk(
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
- to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec);
- to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+ to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
to->di_flags2 = be64_to_cpu(from->di_flags2);
to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
}
@@ -252,6 +278,25 @@ out_destroy_data_fork:
return error;
}
+/* Convert an incore timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_inode_to_disk_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_timestamp *lts;
+ xfs_timestamp_t ts;
+
+ if (xfs_inode_has_bigtime(ip))
+ return cpu_to_be64(xfs_inode_encode_bigtime(tv));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lts->t_sec = cpu_to_be32(tv.tv_sec);
+ lts->t_nsec = cpu_to_be32(tv.tv_nsec);
+
+ return ts;
+}
+
void
xfs_inode_to_disk(
struct xfs_inode *ip,
@@ -271,12 +316,9 @@ xfs_inode_to_disk(
to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
memset(to->di_pad, 0, sizeof(to->di_pad));
- to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
- to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
- to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
- to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
@@ -295,8 +337,7 @@ xfs_inode_to_disk(
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec);
+ to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
@@ -310,58 +351,6 @@ xfs_inode_to_disk(
}
}
-void
-xfs_log_dinode_to_disk(
- struct xfs_log_dinode *from,
- struct xfs_dinode *to)
-{
- to->di_magic = cpu_to_be16(from->di_magic);
- to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_onlink = 0;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_nlink = cpu_to_be32(from->di_nlink);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-
- to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
- to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
- to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-
- to->di_size = cpu_to_be64(from->di_size);
- to->di_nblocks = cpu_to_be64(from->di_nblocks);
- to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
- to->di_dmstate = cpu_to_be16(from->di_dmstate);
- to->di_flags = cpu_to_be16(from->di_flags);
- to->di_gen = cpu_to_be32(from->di_gen);
-
- if (from->di_version == 3) {
- to->di_changecount = cpu_to_be64(from->di_changecount);
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
- to->di_flags2 = cpu_to_be64(from->di_flags2);
- to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
- to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
- uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
- } else {
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
- }
-}
-
static xfs_failaddr_t
xfs_dinode_verify_fork(
struct xfs_dinode *dip,
@@ -568,6 +557,11 @@ xfs_dinode_verify(
if (fa)
return fa;
+ /* bigtime iflag can only happen on bigtime filesystems */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ return __this_address;
+
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 6b08b9d060c2..536666143fe7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -32,6 +32,11 @@ struct xfs_icdinode {
struct timespec64 di_crtime; /* time created */
};
+static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd)
+{
+ return icd->di_flags2 & XFS_DIFLAG2_BIGTIME;
+}
+
/*
* Inode location information. Stored in the inode and passed to
* xfs_imap_to_bp() to get a buffer and dinode for a given inode.
@@ -49,8 +54,6 @@ void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
xfs_lsn_t lsn);
int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
-void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
- struct xfs_dinode *to);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
@@ -60,4 +63,12 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
uint32_t cowextsize, uint16_t mode, uint16_t flags,
uint64_t flags2);
+static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
+{
+ return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec;
+}
+
+struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
+ const xfs_timestamp_t ts);
+
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0cf853d42d62..7575de5cecb1 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -386,8 +386,8 @@ xfs_iroot_realloc(
cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
- ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- KM_NOFS);
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -496,8 +496,8 @@ xfs_idata_realloc(
* in size so that it can be logged and stay on word boundaries.
* We enforce that here.
*/
- ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
- roundup(new_size, 4), KM_NOFS);
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e3400c9c71cd..8bd00da6d2a4 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -368,10 +368,13 @@ static inline int xfs_ilog_fdata(int w)
* directly mirrors the xfs_dinode structure as it must contain all the same
* information.
*/
-typedef struct xfs_ictimestamp {
+typedef uint64_t xfs_ictimestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_ictimestamp {
int32_t t_sec; /* timestamp seconds */
int32_t t_nsec; /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
+};
/*
* Define the format of the inode core that is logged. This structure must be
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 641132d0e39d..3cca2bfe714c 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -121,7 +121,6 @@ struct xlog_recover {
void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
const struct xfs_buf_ops *ops);
bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
-void xlog_recover_iodone(struct xfs_buf *bp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 076bdc7037ee..0f0af4e35032 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -23,7 +23,8 @@ typedef uint8_t xfs_dqtype_t;
#define XFS_DQTYPE_STRINGS \
{ XFS_DQTYPE_USER, "USER" }, \
{ XFS_DQTYPE_PROJ, "PROJ" }, \
- { XFS_DQTYPE_GROUP, "GROUP" }
+ { XFS_DQTYPE_GROUP, "GROUP" }, \
+ { XFS_DQTYPE_BIGTIME, "BIGTIME" }
/*
* flags for q_flags field in the dquot.
@@ -143,4 +144,9 @@ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
xfs_dqid_t id, xfs_dqtype_t type);
+struct xfs_dquot;
+time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
+ __be32 dtimer);
+__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
+
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4df87546bd40..5aeafa59ed27 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -600,7 +600,7 @@ xfs_sb_quota_to_disk(
* disk. If neither are active, we should NULL the inode.
*
* In all cases, the separate pquotino must remain 0 because it
- * it beyond the "end" of the valid non-pquotino superblock.
+ * is beyond the "end" of the valid non-pquotino superblock.
*/
if (from->sb_qflags & XFS_GQUOTA_ACCT)
to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
@@ -954,7 +954,7 @@ xfs_log_sb(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *bp = xfs_trans_getsb(tp, mp);
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
@@ -1084,7 +1084,7 @@ xfs_sync_sb_buf(
if (error)
return error;
- bp = xfs_trans_getsb(tp, mp);
+ bp = xfs_trans_getsb(tp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
xfs_trans_set_sync(tp);
@@ -1166,6 +1166,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
if (xfs_sb_version_hasreflink(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
+ if (xfs_sb_version_hasbigtime(sbp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
if (xfs_sb_version_hassector(sbp))
geo->logsectsize = sbp->sb_logsectsize;
else
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 708feb8eac76..c795ae47b3c9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -176,6 +176,9 @@ struct xfs_ino_geometry {
unsigned int ialloc_align;
unsigned int agino_log; /* #bits for agino in inum */
+
+ /* precomputed value for di_flags2 */
+ uint64_t new_diflags2;
};
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index e15129647e00..90f1d5645052 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -110,9 +110,9 @@ xfs_trans_log_inode(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
+ if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
}
@@ -132,6 +132,17 @@ xfs_trans_log_inode(
}
/*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
* Record the specific change for fdatasync optimisation. This allows
* fdatasync to skip log forces for inodes that are only timestamp
* dirty.
@@ -177,9 +188,9 @@ xfs_trans_log_inode(
/*
* Always OR in the bits from the ili_last_fields field. This is to
- * coordinate with the xfs_iflush() and xfs_iflush_done() routines in
- * the eventual clearing of the ili_fields bits. See the big comment in
- * xfs_iflush() for an explanation of this coordination mechanism.
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
*/
iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
spin_unlock(&iip->ili_lock);
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index c6df01a2a158..7ad3659c5d2a 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -58,7 +58,7 @@
#define XFS_IALLOC_SPACE_RES(mp) \
(M_IGEO(mp)->ialloc_blks + \
((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
- (M_IGEO(mp)->inobt_maxlevels - 1)))
+ M_IGEO(mp)->inobt_maxlevels))
/*
* Space reservation values for various transactions.
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index e9bcf1faa183..ae8e2e0ac64a 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -781,6 +781,35 @@ xchk_agi_xref_icounts(
xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
}
+/* Check agi_[fi]blocks against tree size */
+static inline void
+xchk_agi_xref_fiblocks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agblock_t blocks;
+ int error = 0;
+
+ if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb))
+ return;
+
+ if (sc->sa.ino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_iblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_fblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+}
+
/* Cross-reference with the other btrees. */
STATIC void
xchk_agi_xref(
@@ -804,6 +833,7 @@ xchk_agi_xref(
xchk_agi_xref_icounts(sc);
xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_agi_xref_fiblocks(sc);
/* scrub teardown will take care of sc->sa for us */
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index bca2ab1d4be9..401f71579ce6 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -810,10 +810,34 @@ xrep_agi_calc_from_btrees(
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ xfs_agblock_t blocks;
+
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ agi->agi_iblocks = cpu_to_be32(blocks);
+ }
xfs_btree_del_cursor(cur, error);
agi->agi_count = cpu_to_be32(count);
agi->agi_freecount = cpu_to_be32(freecount);
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ xfs_agblock_t blocks;
+
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
+ XFS_BTNUM_FINO);
+ if (error)
+ goto err;
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agi->agi_fblocks = cpu_to_be32(blocks);
+ }
+
return 0;
err:
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6d483ab29e63..3aa85b64de36 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -190,11 +190,30 @@ xchk_inode_flags2(
if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK))
goto bad;
+ /* no bigtime iflag without the bigtime feature */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ goto bad;
+
return;
bad:
xchk_ino_set_corrupt(sc, ino);
}
+static inline void
+xchk_dinode_nsec(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+
+ tv = xfs_inode_from_disk_ts(dip, ts);
+ if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
/* Scrub all the ondisk inode fields. */
STATIC void
xchk_dinode(
@@ -293,12 +312,9 @@ xchk_dinode(
}
/* di_[amc]time.nsec */
- if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_atime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_mtime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_ctime);
/*
* di_size. xfs_dinode_verify checks for things that screw up
@@ -403,8 +419,7 @@ xchk_dinode(
}
if (dip->di_version >= 3) {
- if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_crtime);
xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
xchk_inode_cowextsize(sc, dip, ino, mode, flags,
flags2);
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 5641ae512c9e..c08be5ede066 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
struct xfs_inode *ip)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index d4c687b5cd06..c544951a0c07 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -192,7 +192,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (acl) {
args.valuelen = XFS_ACL_SIZE(acl->a_count);
- args.value = kmem_zalloc_large(args.valuelen, 0);
+ args.value = kvzalloc(args.valuelen, GFP_KERNEL);
if (!args.value)
return -ENOMEM;
xfs_acl_to_disk(args.value, acl);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b35611882ff9..55d126d4e096 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -544,7 +544,7 @@ xfs_discard_page(
page, ip->i_ino, offset);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- PAGE_SIZE / i_blocksize(inode));
+ i_blocks_per_page(inode, page));
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index e380bd1a9bfc..8f8837fe21cf 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -44,7 +44,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
/*
* Copy out entries of shortform attribute lists for attr_list().
* Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
+ * If the output buffer is not large enough to hold them all, then
* we have to calculate each entries' hashvalue and sort them before
* we can begin returning them to the user.
*/
@@ -61,7 +61,7 @@ xfs_attr_shortform_list(
int error = 0;
ASSERT(dp->i_afp != NULL);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
@@ -96,7 +96,7 @@ xfs_attr_shortform_list(
*/
if (context->seen_enough)
break;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
trace_xfs_attr_list_sf_all(context);
return 0;
@@ -136,7 +136,7 @@ xfs_attr_shortform_list(
/* These are bytes, and both on-disk, don't endian-flip */
sbp->valuelen = sfe->valuelen;
sbp->flags = sfe->flags;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
sbp++;
nsbuf++;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 73cafc843cd7..f2a8a0e75e1f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -946,6 +946,14 @@ xfs_free_file_space(
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+ /* We can only free complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+
+ if ((startoffset_fsb | endoffset_fsb) & (extsz - 1))
+ return -EINVAL;
+ }
+
/*
* Need to zero the stuff we're not freeing, on disk.
*/
@@ -1139,6 +1147,14 @@ xfs_insert_file_space(
trace_xfs_insert_file_space(ip);
+ /* We can only insert complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+
+ if ((stop_fsb | shift_fsb) & (extsz - 1))
+ return -EINVAL;
+ }
+
error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
if (error)
return error;
@@ -1165,7 +1181,7 @@ xfs_insert_file_space(
goto out_trans_cancel;
do {
- error = xfs_trans_roll_inode(&tp, ip);
+ error = xfs_defer_finish(&tp);
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d4cdcb6fb2fe..4e4cf91f4f9f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -52,6 +52,15 @@ static kmem_zone_t *xfs_buf_zone;
* b_lock (trylock due to inversion)
*/
+static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
+
+static inline int
+xfs_buf_submit(
+ struct xfs_buf *bp)
+{
+ return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -751,7 +760,7 @@ found:
return 0;
}
-STATIC int
+int
_xfs_buf_read(
xfs_buf_t *bp,
xfs_buf_flags_t flags)
@@ -759,7 +768,7 @@ _xfs_buf_read(
ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
- bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
return xfs_buf_submit(bp);
@@ -1170,20 +1179,145 @@ xfs_buf_wait_unpin(
set_current_state(TASK_RUNNING);
}
+static void
+xfs_buf_ioerror_alert_ratelimited(
+ struct xfs_buf *bp)
+{
+ static unsigned long lasttime;
+ static struct xfs_buftarg *lasttarg;
+
+ if (bp->b_target != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_buf_ioerror_alert(bp, __this_address);
+ }
+ lasttarg = bp->b_target;
+}
+
/*
- * Buffer Utility Routines
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
*/
+static bool
+xfs_buf_ioerror_permanent(
+ struct xfs_buf *bp,
+ struct xfs_error_cfg *cfg)
+{
+ struct xfs_mount *mp = bp->b_mount;
-void
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ return true;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ return true;
+
+ /* At unmount we may treat errors differently */
+ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ return true;
+
+ return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Returns true if this function took care of error handling and the caller must
+ * not touch the buffer again. Return false if the caller should proceed with
+ * normal I/O completion handling.
+ */
+static bool
+xfs_buf_ioend_handle_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_error_cfg *cfg;
+
+ /*
+ * If we've already decided to shutdown the filesystem because of I/O
+ * errors, there's no point in giving this a retry.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out_stale;
+
+ xfs_buf_ioerror_alert_ratelimited(bp);
+
+ /*
+ * We're not going to bother about retrying this during recovery.
+ * One strike!
+ */
+ if (bp->b_flags & _XBF_LOGRECOVERY) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+ }
+
+ /*
+ * Synchronous writes will have callers process the error.
+ */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_iodone_async(bp, _RET_IP_);
+
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+ if (bp->b_last_error != bp->b_error ||
+ !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
+ bp->b_last_error = bp->b_error;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
+ goto resubmit;
+ }
+
+ /*
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
+ */
+ if (xfs_buf_ioerror_permanent(bp, cfg)) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out_stale;
+ }
+
+ /* Still considered a transient error. Caller will schedule retries. */
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_io_fail(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_io_fail(bp);
+ else
+ ASSERT(list_empty(&bp->b_li_list));
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
+
+resubmit:
+ xfs_buf_ioerror(bp, 0);
+ bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ xfs_buf_submit(bp);
+ return true;
+out_stale:
+ xfs_buf_stale(bp);
+ bp->b_flags |= XBF_DONE;
+ bp->b_flags &= ~XBF_WRITE;
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+static void
xfs_buf_ioend(
struct xfs_buf *bp)
{
- bool read = bp->b_flags & XBF_READ;
-
trace_xfs_buf_iodone(bp, _RET_IP_);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-
/*
* Pull in IO completion errors now. We are guaranteed to be running
* single threaded, so we don't need the lock to read b_io_error.
@@ -1191,39 +1325,47 @@ xfs_buf_ioend(
if (!bp->b_error && bp->b_io_error)
xfs_buf_ioerror(bp, bp->b_io_error);
- if (read) {
+ if (bp->b_flags & XBF_READ) {
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
bp->b_flags |= XBF_DONE;
- xfs_buf_ioend_finish(bp);
- return;
- }
+ } else {
+ if (!bp->b_error) {
+ bp->b_flags &= ~XBF_WRITE_FAIL;
+ bp->b_flags |= XBF_DONE;
+ }
- if (!bp->b_error) {
- bp->b_flags &= ~XBF_WRITE_FAIL;
- bp->b_flags |= XBF_DONE;
- }
+ if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
+ return;
- /*
- * If this is a log recovery buffer, we aren't doing transactional IO
- * yet so we need to let it handle IO completions.
- */
- if (bp->b_flags & _XBF_LOGRECOVERY) {
- xlog_recover_iodone(bp);
- return;
- }
+ /* clear the retry state */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
+ bp->b_first_retry_time = 0;
- if (bp->b_flags & _XBF_INODES) {
- xfs_buf_inode_iodone(bp);
- return;
- }
+ /*
+ * Note that for things like remote attribute buffers, there may
+ * not be a buffer log item here, so processing the buffer log
+ * item must remain optional.
+ */
+ if (bp->b_log_item)
+ xfs_buf_item_done(bp);
+
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_iodone(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_iodone(bp);
- if (bp->b_flags & _XBF_DQUOTS) {
- xfs_buf_dquot_iodone(bp);
- return;
}
- xfs_buf_iodone(bp);
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
+ _XBF_LOGRECOVERY);
+
+ if (bp->b_flags & XBF_ASYNC)
+ xfs_buf_relse(bp);
+ else
+ complete(&bp->b_iowait);
}
static void
@@ -1506,7 +1648,7 @@ xfs_buf_iowait(
* safe to reference the buffer after a call to this function unless the caller
* holds an additional reference itself.
*/
-int
+static int
__xfs_buf_submit(
struct xfs_buf *bp,
bool wait)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 755b652e695a..bfd2907e7bc4 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -249,6 +249,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
size_t numblks, int flags, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
+int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
@@ -269,28 +270,12 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(struct xfs_buf *bp);
-static inline void xfs_buf_ioend_finish(struct xfs_buf *bp)
-{
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_relse(bp);
- else
- complete(&bp->b_iowait);
-}
extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
void xfs_buf_ioend_fail(struct xfs_buf *);
-
-extern int __xfs_buf_submit(struct xfs_buf *bp, bool);
-static inline int xfs_buf_submit(struct xfs_buf *bp)
-{
- bool wait = bp->b_flags & XBF_ASYNC ? false : true;
- return __xfs_buf_submit(bp, wait);
-}
-
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 5bb6f22cc11a..0356f2e340a1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -30,8 +30,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
-static void xfs_buf_item_done(struct xfs_buf *bp);
-
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
@@ -127,7 +125,7 @@ xfs_buf_item_size_segment(
* stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
* in a single iovec.
*
- * Discontiguous buffers need a format structure per region that that is being
+ * Discontiguous buffers need a format structure per region that is being
* logged. This makes the changes in the buffer appear to log recovery as though
* they came from separate buffers, just like would occur if multiple buffers
* were used instead of a single discontiguous buffer. This enables
@@ -463,7 +461,7 @@ xfs_buf_item_unpin(
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
xfs_buf_item_done(bp);
- xfs_iflush_done(bp);
+ xfs_buf_inode_iodone(bp);
ASSERT(list_empty(&bp->b_li_list));
} else {
xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
@@ -956,153 +954,10 @@ xfs_buf_item_relse(
xfs_buf_item_free(bip);
}
-/*
- * Decide if we're going to retry the write after a failure, and prepare
- * the buffer for retrying the write.
- */
-static bool
-xfs_buf_ioerror_fail_without_retry(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_mount;
- static ulong lasttime;
- static xfs_buftarg_t *lasttarg;
-
- /*
- * If we've already decided to shutdown the filesystem because of
- * I/O errors, there's no point in giving this a retry.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
- return true;
-
- if (bp->b_target != lasttarg ||
- time_after(jiffies, (lasttime + 5*HZ))) {
- lasttime = jiffies;
- xfs_buf_ioerror_alert(bp, __this_address);
- }
- lasttarg = bp->b_target;
-
- /* synchronous writes will have callers process the error */
- if (!(bp->b_flags & XBF_ASYNC))
- return true;
- return false;
-}
-
-static bool
-xfs_buf_ioerror_retry(
- struct xfs_buf *bp,
- struct xfs_error_cfg *cfg)
-{
- if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) &&
- bp->b_last_error == bp->b_error)
- return false;
-
- bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
- bp->b_last_error = bp->b_error;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- !bp->b_first_retry_time)
- bp->b_first_retry_time = jiffies;
- return true;
-}
-
-/*
- * Account for this latest trip around the retry handler, and decide if
- * we've failed enough times to constitute a permanent failure.
- */
-static bool
-xfs_buf_ioerror_permanent(
- struct xfs_buf *bp,
- struct xfs_error_cfg *cfg)
-{
- struct xfs_mount *mp = bp->b_mount;
-
- if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
- ++bp->b_retries > cfg->max_retries)
- return true;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
- return true;
-
- /* At unmount we may treat errors differently */
- if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
- return true;
-
- return false;
-}
-
-/*
- * On a sync write or shutdown we just want to stale the buffer and let the
- * caller handle the error in bp->b_error appropriately.
- *
- * If the write was asynchronous then no one will be looking for the error. If
- * this is the first failure of this type, clear the error state and write the
- * buffer out again. This means we always retry an async write failure at least
- * once, but we also need to set the buffer up to behave correctly now for
- * repeated failures.
- *
- * If we get repeated async write failures, then we take action according to the
- * error configuration we have been set up to use.
- *
- * Multi-state return value:
- *
- * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions
- * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions
- * XBF_IOERROR_FAIL: transient error, run failure callback completions and then
- * release the buffer
- */
-enum {
- XBF_IOERROR_FINISH,
- XBF_IOERROR_DONE,
- XBF_IOERROR_FAIL,
-};
-
-static int
-xfs_buf_iodone_error(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_mount;
- struct xfs_error_cfg *cfg;
-
- if (xfs_buf_ioerror_fail_without_retry(bp))
- goto out_stale;
-
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-
- cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
- if (xfs_buf_ioerror_retry(bp, cfg)) {
- xfs_buf_ioerror(bp, 0);
- xfs_buf_submit(bp);
- return XBF_IOERROR_DONE;
- }
-
- /*
- * Permanent error - we need to trigger a shutdown if we haven't already
- * to indicate that inconsistency will result from this action.
- */
- if (xfs_buf_ioerror_permanent(bp, cfg)) {
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- goto out_stale;
- }
-
- /* Still considered a transient error. Caller will schedule retries. */
- return XBF_IOERROR_FAIL;
-
-out_stale:
- xfs_buf_stale(bp);
- bp->b_flags |= XBF_DONE;
- trace_xfs_buf_error_relse(bp, _RET_IP_);
- return XBF_IOERROR_FINISH;
-}
-
-static void
+void
xfs_buf_item_done(
struct xfs_buf *bp)
{
- struct xfs_buf_log_item *bip = bp->b_log_item;
-
- if (!bip)
- return;
-
/*
* If we are forcibly shutting down, this may well be off the AIL
* already. That's because we simulate the log-committed callbacks to
@@ -1111,113 +966,12 @@ xfs_buf_item_done(
* xfs_trans_ail_delete() takes care of these.
*
* Either way, AIL is useless if we're forcing a shutdown.
+ *
+ * Note that log recovery writes might have buffer items that are not on
+ * the AIL even when the file system is not shut down.
*/
- xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE);
- bp->b_log_item = NULL;
- xfs_buf_item_free(bip);
- xfs_buf_rele(bp);
-}
-
-static inline void
-xfs_buf_clear_ioerror_retry_state(
- struct xfs_buf *bp)
-{
- bp->b_last_error = 0;
- bp->b_retries = 0;
- bp->b_first_retry_time = 0;
-}
-
-/*
- * Inode buffer iodone callback function.
- */
-void
-xfs_buf_inode_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- struct xfs_log_item *lip;
- int ret = xfs_buf_iodone_error(bp);
-
- if (ret == XBF_IOERROR_FINISH)
- goto finish_iodone;
- if (ret == XBF_IOERROR_DONE)
- return;
- ASSERT(ret == XBF_IOERROR_FAIL);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- set_bit(XFS_LI_FAILED, &lip->li_flags);
- }
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return;
- }
-
-finish_iodone:
- xfs_buf_clear_ioerror_retry_state(bp);
- xfs_buf_item_done(bp);
- xfs_iflush_done(bp);
- xfs_buf_ioend_finish(bp);
-}
-
-/*
- * Dquot buffer iodone callback function.
- */
-void
-xfs_buf_dquot_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- struct xfs_log_item *lip;
- int ret = xfs_buf_iodone_error(bp);
-
- if (ret == XBF_IOERROR_FINISH)
- goto finish_iodone;
- if (ret == XBF_IOERROR_DONE)
- return;
- ASSERT(ret == XBF_IOERROR_FAIL);
- spin_lock(&bp->b_mount->m_ail->ail_lock);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- xfs_set_li_failed(lip, bp);
- }
- spin_unlock(&bp->b_mount->m_ail->ail_lock);
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return;
- }
-
-finish_iodone:
- xfs_buf_clear_ioerror_retry_state(bp);
- /* a newly allocated dquot buffer might have a log item attached */
- xfs_buf_item_done(bp);
- xfs_dquot_done(bp);
- xfs_buf_ioend_finish(bp);
-}
-
-/*
- * Dirty buffer iodone callback function.
- *
- * Note that for things like remote attribute buffers, there may not be a buffer
- * log item here, so processing the buffer log item must remain be optional.
- */
-void
-xfs_buf_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- int ret = xfs_buf_iodone_error(bp);
-
- if (ret == XBF_IOERROR_FINISH)
- goto finish_iodone;
- if (ret == XBF_IOERROR_DONE)
- return;
- ASSERT(ret == XBF_IOERROR_FAIL);
- ASSERT(list_empty(&bp->b_li_list));
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return;
- }
-
-finish_iodone:
- xfs_buf_clear_ioerror_retry_state(bp);
- xfs_buf_item_done(bp);
- xfs_buf_ioend_finish(bp);
+ xfs_trans_ail_delete(&bp->b_log_item->bli_item,
+ (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
+ SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_relse(bp);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 23507cbb4c41..50aa0f5ef959 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -50,12 +50,24 @@ struct xfs_buf_log_item {
};
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void xfs_buf_item_done(struct xfs_buf *bp);
void xfs_buf_item_relse(struct xfs_buf *);
bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_inode_iodone(struct xfs_buf *);
+void xfs_buf_inode_io_fail(struct xfs_buf *bp);
+#ifdef CONFIG_XFS_QUOTA
void xfs_buf_dquot_iodone(struct xfs_buf *);
+void xfs_buf_dquot_io_fail(struct xfs_buf *bp);
+#else
+static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
+{
+}
+static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
+{
+}
+#endif /* CONFIG_XFS_QUOTA */
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d480f11e6b00..24c7a8d11e1a 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -414,7 +414,7 @@ xlog_recover_validate_buf_type(
*
* Write verifiers update the metadata LSN from log items attached to
* the buffer. Therefore, initialize a bli purely to carry the LSN to
- * the verifier. We'll clean it up in our ->iodone() callback.
+ * the verifier.
*/
if (bp->b_ops) {
struct xfs_buf_log_item *bip;
@@ -948,7 +948,7 @@ xlog_recover_buf_commit_pass2(
* or inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
- * running with a different inode cluster size. Regardless, if the
+ * running with a different inode cluster size. Regardless, if
* the inode buffer size isn't max(blocksize, inode_cluster_size)
* for *our* value of inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 04dc2be19c3a..3072814e407d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -98,12 +98,33 @@ xfs_qm_adjust_dqlimits(
xfs_dquot_set_prealloc_limits(dq);
}
+/* Set the expiration time of a quota's grace period. */
+time64_t
+xfs_dquot_set_timeout(
+ struct xfs_mount *mp,
+ time64_t timeout)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+
+ return clamp_t(time64_t, timeout, qi->qi_expiry_min,
+ qi->qi_expiry_max);
+}
+
+/* Set the length of the default grace period. */
+time64_t
+xfs_dquot_set_grace_period(
+ time64_t grace)
+{
+ return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX);
+}
+
/*
* Determine if this quota counter is over either limit and set the quota
* timers as appropriate.
*/
static inline void
xfs_qm_adjust_res_timer(
+ struct xfs_mount *mp,
struct xfs_dquot_res *res,
struct xfs_quota_limits *qlim)
{
@@ -112,7 +133,8 @@ xfs_qm_adjust_res_timer(
if ((res->softlimit && res->count > res->softlimit) ||
(res->hardlimit && res->count > res->hardlimit)) {
if (res->timer == 0)
- res->timer = ktime_get_real_seconds() + qlim->time;
+ res->timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + qlim->time);
} else {
if (res->timer == 0)
res->warnings = 0;
@@ -145,9 +167,9 @@ xfs_qm_adjust_dqtimers(
ASSERT(dq->q_id);
defq = xfs_get_defquota(qi, xfs_dquot_type(dq));
- xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk);
- xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino);
- xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb);
}
/*
@@ -201,6 +223,8 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_type = type;
+ if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb))
+ d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
@@ -514,9 +538,9 @@ xfs_dquot_from_disk(
dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
- dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer);
- dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer);
- dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer);
+ dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
+ dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
+ dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
/*
* Reservation counters are defined as reservation plus current usage
@@ -559,9 +583,9 @@ xfs_dquot_to_disk(
ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
- ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer);
- ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer);
- ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer);
+ ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
+ ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
+ ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
}
/* Allocate and initialize the dquot buffer for this in-core dquot. */
@@ -807,7 +831,7 @@ xfs_qm_dqget_checks(
}
/*
- * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
+ * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked
* dquot, doing an allocation (if requested) as needed.
*/
int
@@ -1107,7 +1131,7 @@ xfs_qm_dqflush_done(
}
void
-xfs_dquot_done(
+xfs_buf_dquot_iodone(
struct xfs_buf *bp)
{
struct xfs_log_item *lip, *n;
@@ -1118,6 +1142,18 @@ xfs_dquot_done(
}
}
+void
+xfs_buf_dquot_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ spin_lock(&bp->b_mount->m_ail->ail_lock);
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ xfs_set_li_failed(lip, bp);
+ spin_unlock(&bp->b_mount->m_ail->ail_lock);
+}
+
/* Check incore dquot for errors before we flush. */
static xfs_failaddr_t
xfs_qm_dqflush_check(
@@ -1145,6 +1181,14 @@ xfs_qm_dqflush_check(
!dqp->q_rtb.timer)
return __this_address;
+ /* bigtime flag should never be set on root dquots */
+ if (dqp->q_type & XFS_DQTYPE_BIGTIME) {
+ if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb))
+ return __this_address;
+ if (dqp->q_id == 0)
+ return __this_address;
+ }
+
return NULL;
}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 282a65da93c7..f642884a6834 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -237,4 +237,7 @@ typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
xfs_qm_dqiterate_fn iter_fn, void *priv);
+time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
+time64_t xfs_dquot_set_grace_period(time64_t grace);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5a4b0119143a..465fd9e048d4 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -56,7 +56,7 @@ xfs_fs_encode_fh(
fileid_type = FILEID_INO32_GEN_PARENT;
/*
- * If the the filesystem may contain 64bit inode numbers, we need
+ * If the filesystem may contain 64bit inode numbers, we need
* to use larger file handles that can represent them.
*
* While we only allocate inodes that do not fit into 32 bits any
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c31cd3be9fb2..3d1b95124744 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1008,6 +1008,21 @@ xfs_file_fadvise(
return ret;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
STATIC loff_t
xfs_file_remap_range(
struct file *file_in,
@@ -1065,7 +1080,7 @@ xfs_file_remap_range(
if (ret)
goto out_unlock;
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
xfs_log_force_inode(dest);
out_unlock:
xfs_iunlock2_io_mmap(src, dest);
@@ -1223,6 +1238,14 @@ __xfs_filemap_fault(
return ret;
}
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
+
static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
@@ -1230,7 +1253,7 @@ xfs_filemap_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
IS_DAX(file_inode(vmf->vma->vm_file)) &&
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
@@ -1243,7 +1266,7 @@ xfs_filemap_huge_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, pe_size,
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 101028ebb571..deb99300d171 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -52,7 +52,6 @@ xfs_inode_alloc(
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
/* initialise the xfs inode */
@@ -123,7 +122,7 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
- ASSERT(!xfs_isiflocked(ip));
+ ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -1035,23 +1034,21 @@ xfs_reclaim_inode(
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
goto out;
- if (!xfs_iflock_nowait(ip))
+ if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
goto out_iunlock;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
- /* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip);
goto reclaim;
}
if (xfs_ipincount(ip))
- goto out_ifunlock;
+ goto out_clear_flush;
if (!xfs_inode_clean(ip))
- goto out_ifunlock;
+ goto out_clear_flush;
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
- ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always appears
@@ -1101,8 +1098,8 @@ reclaim:
__xfs_inode_free(ip);
return;
-out_ifunlock:
- xfs_ifunlock(ip);
+out_clear_flush:
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
@@ -1211,7 +1208,7 @@ xfs_reclaim_inodes(
while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
xfs_ail_push_all_sync(mp->m_ail);
xfs_reclaim_inodes_ag(mp, &nr_to_scan);
- };
+ }
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 407d6299606d..49624973eecc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -451,7 +451,7 @@ xfs_lock_inodes(
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
* support an arbitrary depth of locking here, but absolute limits on
- * inodes depend on the the type of locking and the limits placed by
+ * inodes depend on the type of locking and the limits placed by
* lockdep annotations in xfs_lock_inumorder. These are all checked by
* the asserts.
*/
@@ -598,22 +598,6 @@ xfs_lock_two_inodes(
}
}
-void
-__xfs_iflock(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wq_entry);
-}
-
STATIC uint
_xfs_dic2xflags(
uint16_t di_flags,
@@ -840,7 +824,7 @@ xfs_ialloc(
if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
inode_set_iversion(inode, 1);
- ip->i_d.di_flags2 = 0;
+ ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
ip->i_d.di_cowextsize = 0;
ip->i_d.di_crtime = tv;
}
@@ -2531,11 +2515,8 @@ retry:
* valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- return;
- }
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
+ goto out_iflags_unlock;
/*
* Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2552,16 +2533,14 @@ retry:
}
}
ip->i_flags |= XFS_ISTALE;
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
/*
- * If we can't get the flush lock, the inode is already attached. All
+ * If the inode is flushing, it is already attached to the buffer. All
* we needed to do here is mark the inode stale so buffer IO completion
* will remove it from the AIL.
*/
iip = ip->i_itemp;
- if (!xfs_iflock_nowait(ip)) {
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
ASSERT(!list_empty(&iip->ili_item.li_bio_list));
ASSERT(iip->ili_last_fields);
goto out_iunlock;
@@ -2573,10 +2552,12 @@ retry:
* commit as the flock synchronises removal of the inode from the
* cluster buffer against inode reclaim.
*/
- if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
- xfs_ifunlock(ip);
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
goto out_iunlock;
- }
+
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
/* we have a dirty inode in memory that has not yet been flushed. */
spin_lock(&iip->ili_lock);
@@ -2586,9 +2567,16 @@ retry:
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return;
+
out_iunlock:
if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_iflags_unlock:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
}
/*
@@ -2631,8 +2619,9 @@ xfs_ifree_cluster(
/*
* We obtain and lock the backing buffer first in the process
- * here, as we have to ensure that any dirty inode that we
- * can't get the flush lock on is attached to the buffer.
+ * here to ensure dirty inodes attached to the buffer remain in
+ * the flushing state while we mark them stale.
+ *
* If we scan the in-memory inodes first, then buffer IO can
* complete before we get a lock on it, and hence we may fail
* to mark all the active inodes on the buffer stale.
@@ -2717,7 +2706,7 @@ xfs_ifree(
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
- ip->i_d.di_flags2 = 0;
+ ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
@@ -3105,7 +3094,7 @@ out_trans_abort:
/*
* xfs_rename_alloc_whiteout()
*
- * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * Return a referenced, unlinked, unlocked inode that can be used as a
* whiteout in a rename transaction. We use a tmpfile inode here so that if we
* crash between allocating the inode and linking it into the rename transaction
* recovery will free the inode and we won't leak it.
@@ -3443,7 +3432,7 @@ xfs_iflush(
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
ASSERT(iip->ili_item.li_buf == bp);
@@ -3553,8 +3542,8 @@ xfs_iflush(
*
* What we do is move the bits to the ili_last_fields field. When
* logging the inode, these bits are moved back to the ili_fields field.
- * In the xfs_iflush_done() routine we clear ili_last_fields, since we
- * know that the information those bits represent is permanently on
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
+ * we know that the information those bits represent is permanently on
* disk. As long as the flush completes before the inode is logged
* again, then both ili_fields and ili_last_fields will be cleared.
*/
@@ -3568,7 +3557,7 @@ flush_out:
/*
* Store the current LSN of the inode so that we can tell whether the
- * item has moved in the AIL from xfs_iflush_done().
+ * item has moved in the AIL from xfs_buf_inode_iodone().
*/
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -3613,7 +3602,7 @@ xfs_iflush_cluster(
/*
* Quick and dirty check to avoid locks if possible.
*/
- if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
continue;
if (xfs_ipincount(ip))
continue;
@@ -3627,7 +3616,7 @@ xfs_iflush_cluster(
*/
spin_lock(&ip->i_flags_lock);
ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
- if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
spin_unlock(&ip->i_flags_lock);
continue;
}
@@ -3635,24 +3624,17 @@ xfs_iflush_cluster(
/*
* ILOCK will pin the inode against reclaim and prevent
* concurrent transactions modifying the inode while we are
- * flushing the inode.
+ * flushing the inode. If we get the lock, set the flushing
+ * state before we drop the i_flags_lock.
*/
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
spin_unlock(&ip->i_flags_lock);
continue;
}
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
spin_unlock(&ip->i_flags_lock);
/*
- * Skip inodes that are already flush locked as they have
- * already been written to the buffer.
- */
- if (!xfs_iflock_nowait(ip)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- continue;
- }
-
- /*
* Abort flushing this inode if we are shut down because the
* inode may not currently be in the AIL. This can occur when
* log I/O failure unpins the inode without inserting into the
@@ -3661,7 +3643,6 @@ xfs_iflush_cluster(
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
xfs_iunpin_wait(ip);
- /* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
error = -EIO;
@@ -3670,7 +3651,7 @@ xfs_iflush_cluster(
/* don't block waiting on a log force to unpin dirty inodes */
if (xfs_ipincount(ip)) {
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
continue;
}
@@ -3678,7 +3659,7 @@ xfs_iflush_cluster(
if (!xfs_inode_clean(ip))
error = xfs_iflush(ip, bp);
else
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (error)
break;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e9a8bb184d1f..751a3d1d7d84 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -194,6 +194,11 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
return ip->i_cowfp && ip->i_cowfp->if_bytes;
}
+static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
+{
+ return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -211,8 +216,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
#define XFS_INEW (1 << __XFS_INEW_BIT)
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
-#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
-#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */
@@ -234,36 +238,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
/*
- * Synchronize processes attempting to flush the in-core inode back to disk.
- */
-
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
- return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
-extern void __xfs_iflock(struct xfs_inode *ip);
-
-static inline int xfs_iflock_nowait(struct xfs_inode *ip)
-{
- return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
-}
-
-static inline void xfs_iflock(struct xfs_inode *ip)
-{
- if (!xfs_iflock_nowait(ip))
- __xfs_iflock(ip);
-}
-
-static inline void xfs_ifunlock(struct xfs_inode *ip)
-{
- ASSERT(xfs_isiflocked(ip));
- xfs_iflags_clear(ip, XFS_IFLOCK);
- smp_mb();
- wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
-}
-
-/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 895f61b2b4f0..17e20a6d8b4e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -191,7 +191,7 @@ xfs_inode_item_format_data_fork(
ip->i_df.if_bytes > 0) {
/*
* Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
+ * The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
@@ -275,7 +275,7 @@ xfs_inode_item_format_attr_fork(
ip->i_afp->if_bytes > 0) {
/*
* Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
+ * The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
@@ -295,6 +295,28 @@ xfs_inode_item_format_attr_fork(
}
}
+/*
+ * Convert an incore timestamp to a log timestamp. Note that the log format
+ * specifies host endian format!
+ */
+static inline xfs_ictimestamp_t
+xfs_inode_to_log_dinode_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_ictimestamp *lits;
+ xfs_ictimestamp_t its;
+
+ if (xfs_inode_has_bigtime(ip))
+ return xfs_inode_encode_bigtime(tv);
+
+ lits = (struct xfs_legacy_ictimestamp *)&its;
+ lits->t_sec = tv.tv_sec;
+ lits->t_nsec = tv.tv_nsec;
+
+ return its;
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -313,12 +335,9 @@ xfs_inode_to_log_dinode(
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime.t_sec = inode->i_atime.tv_sec;
- to->di_atime.t_nsec = inode->i_atime.tv_nsec;
- to->di_mtime.t_sec = inode->i_mtime.tv_sec;
- to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
- to->di_ctime.t_sec = inode->i_ctime.tv_sec;
- to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
@@ -340,8 +359,7 @@ xfs_inode_to_log_dinode(
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
- to->di_crtime.t_sec = from->di_crtime.tv_sec;
- to->di_crtime.t_nsec = from->di_crtime.tv_nsec;
+ to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime);
to->di_flags2 = from->di_flags2;
to->di_cowextsize = from->di_cowextsize;
to->di_ino = ip->i_ino;
@@ -491,8 +509,7 @@ xfs_inode_item_push(
(ip->i_flags & XFS_ISTALE))
return XFS_ITEM_PINNED;
- /* If the inode is already flush locked, we're already flushing. */
- if (xfs_isiflocked(ip))
+ if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
if (!xfs_buf_trylock(bp))
@@ -703,7 +720,7 @@ xfs_iflush_finish(
iip->ili_last_fields = 0;
iip->ili_flush_lsn = 0;
spin_unlock(&iip->ili_lock);
- xfs_ifunlock(iip->ili_inode);
+ xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
if (drop_buffer)
xfs_buf_rele(bp);
}
@@ -711,11 +728,11 @@ xfs_iflush_finish(
/*
* Inode buffer IO completion routine. It is responsible for removing inodes
- * attached to the buffer from the AIL if they have not been re-logged, as well
- * as completing the flush and unlocking the inode.
+ * attached to the buffer from the AIL if they have not been re-logged and
+ * completing the inode flush.
*/
void
-xfs_iflush_done(
+xfs_buf_inode_iodone(
struct xfs_buf *bp)
{
struct xfs_log_item *lip, *n;
@@ -754,11 +771,21 @@ xfs_iflush_done(
list_splice_tail(&flushed_inodes, &bp->b_li_list);
}
+void
+xfs_buf_inode_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
+}
+
/*
- * This is the inode flushing abort routine. It is called from xfs_iflush when
+ * This is the inode flushing abort routine. It is called when
* the filesystem is shutting down to clean up the inode state. It is
* responsible for removing the inode item from the AIL if it has not been
- * re-logged, and unlocking the inode's flush lock.
+ * re-logged and clearing the inode's flush state.
*/
void
xfs_iflush_abort(
@@ -790,7 +817,7 @@ xfs_iflush_abort(
list_del_init(&iip->ili_item.li_bio_list);
spin_unlock(&iip->ili_lock);
}
- xfs_ifunlock(ip);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
if (bp)
xfs_buf_rele(bp);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 048b5e7dee90..4b926e32831c 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -25,8 +25,8 @@ struct xfs_inode_log_item {
*
* We need atomic changes between inode dirtying, inode flushing and
* inode completion, but these all hold different combinations of
- * ILOCK and iflock and hence we need some other method of serialising
- * updates to the flush state.
+ * ILOCK and IFLUSHING and hence we need some other method of
+ * serialising updates to the flush state.
*/
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
@@ -43,7 +43,6 @@ static inline int xfs_inode_clean(struct xfs_inode *ip)
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *);
extern void xfs_iflush_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 5e0d291835b3..cb44f7653f03 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -115,6 +115,82 @@ out_free_ip:
return error;
}
+static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
+}
+
+/* Convert a log timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_log_dinode_to_disk_ts(
+ struct xfs_log_dinode *from,
+ const xfs_ictimestamp_t its)
+{
+ struct xfs_legacy_timestamp *lts;
+ struct xfs_legacy_ictimestamp *lits;
+ xfs_timestamp_t ts;
+
+ if (xfs_log_dinode_has_bigtime(from))
+ return cpu_to_be64(its);
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lits = (struct xfs_legacy_ictimestamp *)&its;
+ lts->t_sec = cpu_to_be32(lits->t_sec);
+ lts->t_nsec = cpu_to_be32(lits->t_nsec);
+
+ return ts;
+}
+
+STATIC void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = 0;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+
+ to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
+ to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
+ to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime = xfs_log_dinode_to_disk_ts(from,
+ from->di_crtime);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ }
+}
+
STATIC int
xlog_recover_inode_commit_pass2(
struct xlog *log,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6f22a66777cd..bca7659fb5c6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -404,7 +404,7 @@ xfs_ioc_attr_list(
context.cursor.offset))
return -EINVAL;
- buffer = kmem_zalloc_large(bufsize, 0);
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
if (!buffer)
return -ENOMEM;
@@ -1190,7 +1190,8 @@ xfs_flags2diflags2(
unsigned int xflags)
{
uint64_t di_flags2 =
- (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+ (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK |
+ XFS_DIFLAG2_BIGTIME));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
@@ -1690,7 +1691,7 @@ xfs_ioc_getbmap(
if (bmx.bmv_count > ULONG_MAX / recsize)
return -ENOMEM;
- buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0);
+ buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0e3f62cde375..3abb8b9d6f4c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -865,7 +865,7 @@ xfs_buffered_write_iomap_begin(
}
/*
- * Search the data fork fork first to look up our source mapping. We
+ * Search the data fork first to look up our source mapping. We
* always need the data fork map, as we have to return it to the
* iomap code so that the higher level write code can read data in to
* perform read-modify-write cycles for unaligned writes.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 56c32eecffea..b0ef071b3cb5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -239,7 +239,7 @@ xfs_cil_prepare_item(
* this CIL context and so we need to pin it. If we are replacing the
* old_lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
- * shadow buffer, so update the the pointer to it appropriately.
+ * shadow buffer, so update the pointer to it appropriately.
*/
if (!old_lv) {
if (lv->lv_item->li_ops->iop_pin)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 52a65a74208f..a17d788921d6 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -265,32 +265,6 @@ xlog_header_check_mount(
return 0;
}
-void
-xlog_recover_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- /*
- * We're not going to bother about retrying
- * this during recovery. One strike!
- */
- if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror_alert(bp, __this_address);
- xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
- }
- }
-
- /*
- * On v5 supers, a bli could be attached to update the metadata LSN.
- * Clean it up.
- */
- if (bp->b_log_item)
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
- bp->b_flags &= ~_XBF_LOGRECOVERY;
- xfs_buf_ioend_finish(bp);
-}
-
/*
* This routine finds (to an approximation) the first block in the physical
* log which contains the given cycle. It uses a binary search algorithm.
@@ -1100,7 +1074,7 @@ xlog_verify_head(
*
* Note that xlog_find_tail() clears the blocks at the new head
* (i.e., the records with invalid CRC) if the cycle number
- * matches the the current cycle.
+ * matches the current cycle.
*/
found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
buffer, rhead_blk, rhead, wrapped);
@@ -2097,7 +2071,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len + old_len, 0);
+ ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -3294,14 +3268,14 @@ xlog_do_log_recovery(
*/
STATIC int
xlog_do_recover(
- struct xlog *log,
- xfs_daddr_t head_blk,
- xfs_daddr_t tail_blk)
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
- xfs_buf_t *bp;
- xfs_sb_t *sbp;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp = mp->m_sb_bp;
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
trace_xfs_log_recover(log, head_blk, tail_blk);
@@ -3315,9 +3289,8 @@ xlog_do_recover(
/*
* If IO errors happened during recovery, bail out.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- }
/*
* We now update the tail_lsn since much of the recovery has completed
@@ -3331,16 +3304,12 @@ xlog_do_recover(
xlog_assign_tail_lsn(mp);
/*
- * Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock and reverify it.
+ * Now that we've finished replaying all buffer and inode updates,
+ * re-read the superblock and reverify it.
*/
- bp = xfs_getsb(mp);
- bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
- ASSERT(!(bp->b_flags & XBF_WRITE));
- bp->b_flags |= XBF_READ;
- bp->b_ops = &xfs_sb_buf_ops;
-
- error = xfs_buf_submit(bp);
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ error = _xfs_buf_read(bp, XBF_READ);
if (error) {
if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_buf_ioerror_alert(bp, __this_address);
@@ -3351,7 +3320,6 @@ xlog_do_recover(
}
/* Convert superblock from on-disk format */
- sbp = &mp->m_sb;
xfs_sb_from_disk(sbp, bp->b_addr);
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c8ae49a1e99c..150ee5cb8645 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -80,9 +80,9 @@ xfs_uuid_mount(
}
if (hole < 0) {
- xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+ xfs_uuid_table = krealloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- 0);
+ GFP_KERNEL | __GFP_NOFAIL);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
@@ -1059,11 +1059,12 @@ xfs_unmountfs(
* We can potentially deadlock here if we have an inode cluster
* that has been freed has its buffer still pinned in memory because
* the transaction is still sitting in a iclog. The stale inodes
- * on that buffer will have their flush locks held until the
- * transaction hits the disk and the callbacks run. the inode
- * flush takes the flush lock unconditionally and with nothing to
- * push out the iclog we will never get that unlocked. hence we
- * need to force the log first.
+ * on that buffer will be pinned to the buffer until the
+ * transaction hits the disk and the callbacks run. Pushing the AIL will
+ * skip the stale inodes and may never see the pinned buffer, so
+ * nothing will push out the iclog and unpin the buffer. Hence we
+ * need to force the log here to ensure all items are flushed into the
+ * AIL before we go any further.
*/
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -1289,23 +1290,6 @@ xfs_mod_frextents(
}
/*
- * xfs_getsb() is called to obtain the buffer for the superblock.
- * The buffer is returned locked and read in from disk.
- * The buffer should be released with a call to xfs_brelse().
- */
-struct xfs_buf *
-xfs_getsb(
- struct xfs_mount *mp)
-{
- struct xfs_buf *bp = mp->m_sb_bp;
-
- xfs_buf_lock(bp);
- xfs_buf_hold(bp);
- ASSERT(bp->b_flags & XBF_DONE);
- return bp;
-}
-
-/*
* Used to free the superblock along various error paths.
*/
void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a72cfcaa4ad1..dfa429b77ee2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -410,7 +410,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 5f04d8a5ab2a..0aa87c210104 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -15,6 +15,10 @@
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
+#define XFS_CHECK_VALUE(value, expected) \
+ BUILD_BUG_ON_MSG((value) != (expected), \
+ "XFS: value of " #value " is wrong, expected " #expected)
+
static inline void __init
xfs_check_ondisk_structs(void)
{
@@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
@@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
@@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
@@ -121,7 +126,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
@@ -152,6 +158,20 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
+
+ /*
+ * Make sure the incore inode timestamp range corresponds to hand
+ * converted values based on the ondisk format specification.
+ */
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET,
+ XFS_LEGACY_TIME_MIN);
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET,
+ 16299260424LL);
+
+ /* Do the same with the incore quota expiration range. */
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
+ 16299260424LL);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index be67570badf8..3f82e0c92c2d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -661,6 +661,17 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
+ if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ qinf->qi_expiry_min =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN);
+ qinf->qi_expiry_max =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX);
+ } else {
+ qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN;
+ qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX;
+ }
+ trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min,
+ qinf->qi_expiry_max);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
@@ -879,6 +890,8 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
}
if (xfs_sb_version_hascrc(&mp->m_sb)) {
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9c078c35d924..e3dabab44097 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -65,6 +65,10 @@ struct xfs_quotainfo {
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
struct shrinker qi_shrinker;
+
+ /* Minimum and maximum quota expiration timestamp values. */
+ time64_t qi_expiry_min;
+ time64_t qi_expiry_max;
};
static inline struct radix_tree_root *
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 1c542b4a5220..ca1b57d291dc 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -479,13 +479,19 @@ xfs_setqlim_warns(
static inline void
xfs_setqlim_timer(
+ struct xfs_mount *mp,
struct xfs_dquot_res *res,
struct xfs_quota_limits *qlim,
s64 timer)
{
- res->timer = timer;
- if (qlim)
- qlim->time = timer;
+ if (qlim) {
+ /* Set the length of the default grace period. */
+ res->timer = xfs_dquot_set_grace_period(timer);
+ qlim->time = res->timer;
+ } else {
+ /* Set the grace period expiration on a quota. */
+ res->timer = xfs_dquot_set_timeout(mp, timer);
+ }
}
/*
@@ -574,7 +580,7 @@ xfs_qm_scall_setqlim(
if (newlim->d_fieldmask & QC_SPC_WARNS)
xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
if (newlim->d_fieldmask & QC_SPC_TIMER)
- xfs_setqlim_timer(res, qlim, newlim->d_spc_timer);
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
/* Blocks on the realtime device. */
hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
@@ -590,7 +596,7 @@ xfs_qm_scall_setqlim(
if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
- xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer);
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
/* Inodes */
hard = (newlim->d_fieldmask & QC_INO_HARD) ?
@@ -606,7 +612,7 @@ xfs_qm_scall_setqlim(
if (newlim->d_fieldmask & QC_INO_WARNS)
xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
if (newlim->d_fieldmask & QC_INO_TIMER)
- xfs_setqlim_timer(res, qlim, newlim->d_ino_timer);
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
if (id != 0) {
/*
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 06b22e35fc90..5a62398940d0 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,8 +108,6 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
-void xfs_dquot_done(struct xfs_buf *);
-
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -151,12 +149,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
-
-static inline void xfs_dquot_done(struct xfs_buf *bp)
-{
- return;
-}
-
#endif /* CONFIG_XFS_QUOTA */
#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 7b2c72bc2858..ca93b6488377 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -485,7 +485,7 @@ xfs_cui_item_recover(
* transaction. Normally, any work that needs to be deferred
* gets attached to the same defer_ops that scheduled the
* refcount update. However, we're in log recovery here, so we
- * we use the passed in defer_ops and to finish up any work that
+ * use the passed in defer_ops and to finish up any work that
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index aac83f9d6107..16098dc42add 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -721,7 +721,7 @@ xfs_reflink_end_cow(
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
- * If we're being called by writeback then the the pages will still
+ * If we're being called by writeback then the pages will still
* have PageWriteback set, which prevents races with reflink remapping
* and truncate. Reflink remapping prevents races with writeback by
* taking the iolock and mmaplock before flushing the pages and
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6209e7b6b895..5b89c12f1566 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -247,6 +247,9 @@ xfs_rtallocate_extent_block(
end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
i <= end;
i++) {
+ /* Make sure we don't scan off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
@@ -442,6 +445,14 @@ xfs_rtallocate_extent_near(
*/
if (bno >= mp->m_sb.sb_rextents)
bno = mp->m_sb.sb_rextents - 1;
+
+ /* Make sure we don't run off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ if (maxlen < minlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+
/*
* Try the exact allocation first.
*/
@@ -862,7 +873,7 @@ xfs_alloc_rsum_cache(
* lower bound on the minimum level with any free extents. We can
* continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
+ mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
if (!mp->m_rsum_cache)
xfs_warn(mp, "could not allocate realtime summary cache");
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 71ac6c1cdc36..baf5de30eebb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -654,11 +654,11 @@ xfs_fs_destroy_inode(
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
/*
- * We always use background reclaim here because even if the
- * inode is clean, it still may be under IO and hence we have
- * to take the flush lock. The background reclaim path handles
- * this more efficiently than we can here, so simply let background
- * reclaim tear down all inodes.
+ * We always use background reclaim here because even if the inode is
+ * clean, it still may be under IO and hence we have wait for IO
+ * completion to occur before we can reclaim the inode. The background
+ * reclaim path handles this more efficiently than we can here, so
+ * simply let background reclaim tear down all inodes.
*/
xfs_inode_set_reclaim_tag(ip);
}
@@ -1484,8 +1484,14 @@ xfs_fc_fill_super(
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
- sb->s_time_min = S32_MIN;
- sb->s_time_max = S32_MAX;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
+ sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
+ } else {
+ sb->s_time_min = XFS_LEGACY_TIME_MIN;
+ sb->s_time_max = XFS_LEGACY_TIME_MAX;
+ }
+ trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
sb->s_iflags |= SB_I_CGROUPWB;
set_posix_acl_flag(sb);
@@ -1494,6 +1500,10 @@ xfs_fc_fill_super(
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= SB_I_VERSION;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ xfs_warn(mp,
+ "EXPERIMENTAL big timestamp feature in use. Use at your own risk!");
+
if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) {
bool rtdev_is_dax = false, datadev_is_dax;
@@ -1549,6 +1559,10 @@ xfs_fc_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ xfs_warn(mp,
+ "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index e9f810fc6731..43585850f154 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -32,9 +32,11 @@ xfs_sysfs_init(
struct xfs_kobj *parent_kobj,
const char *name)
{
+ struct kobject *parent;
+
+ parent = parent_kobj ? &parent_kobj->kobject : NULL;
init_completion(&kobj->complete);
- return kobject_init_and_add(&kobj->kobject, ktype,
- &parent_kobj->kobject, "%s", name);
+ return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
}
static inline void
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index abb1d859f226..dcdcf99cfa5d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -338,7 +338,7 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
@@ -3676,7 +3676,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \
DEFINE_KMEM_EVENT(kmem_alloc);
DEFINE_KMEM_EVENT(kmem_alloc_io);
DEFINE_KMEM_EVENT(kmem_alloc_large);
-DEFINE_KMEM_EVENT(kmem_realloc);
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
@@ -3844,6 +3843,32 @@ TRACE_EVENT(xfs_btree_bload_block,
__entry->nr_records)
)
+DECLARE_EVENT_CLASS(xfs_timestamp_range_class,
+ TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max),
+ TP_ARGS(mp, min, max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(long long, min)
+ __field(long long, max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->min = min;
+ __entry->max = max;
+ ),
+ TP_printk("dev %d:%d min %lld max %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->min,
+ __entry->max)
+)
+
+#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \
+DEFINE_EVENT(xfs_timestamp_range_class, name, \
+ TP_PROTO(struct xfs_mount *mp, long long min, long long max), \
+ TP_ARGS(mp, min, max))
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ed72867b1a19..ca18a040336a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -468,7 +468,7 @@ xfs_trans_apply_sb_deltas(
xfs_buf_t *bp;
int whole = 0;
- bp = xfs_trans_getsb(tp, tp->t_mountp);
+ bp = xfs_trans_getsb(tp);
sbp = bp->b_addr;
/*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b752501818d2..f46534b75236 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -209,7 +209,7 @@ xfs_trans_read_buf(
flags, bpp, ops);
}
-struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
+struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0c783d339675..dbb69b4bf3ed 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -480,7 +480,7 @@ xfsaild_push(
* inode buffer is locked because we already pushed the
* updates to it as part of inode clustering.
*
- * We do not want to to stop flushing just because lots
+ * We do not want to stop flushing just because lots
* of items are already being flushed, but we need to
* re-try the flushing relatively soon if most of the
* AIL is being flushed.
@@ -515,7 +515,7 @@ xfsaild_push(
/*
* Are there too many items we can't do anything with?
*
- * If we we are skipping too many items because we can't flush
+ * If we are skipping too many items because we can't flush
* them or they are already being flushed, we back off and
* given them time to complete whatever operation is being
* done. i.e. remove pressure from the AIL while we can't make
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 11cd666cd99a..42d63b830cb9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -166,50 +166,34 @@ xfs_trans_get_buf_map(
}
/*
- * Get and lock the superblock buffer of this file system for the
- * given transaction.
- *
- * We don't need to use incore_match() here, because the superblock
- * buffer is a private buffer which we keep a pointer to in the
- * mount structure.
+ * Get and lock the superblock buffer for the given transaction.
*/
-xfs_buf_t *
+struct xfs_buf *
xfs_trans_getsb(
- xfs_trans_t *tp,
- struct xfs_mount *mp)
+ struct xfs_trans *tp)
{
- xfs_buf_t *bp;
- struct xfs_buf_log_item *bip;
+ struct xfs_buf *bp = tp->t_mountp->m_sb_bp;
/*
- * Default to just trying to lock the superblock buffer
- * if tp is NULL.
+ * Just increment the lock recursion count if the buffer is already
+ * attached to this transaction.
*/
- if (tp == NULL)
- return xfs_getsb(mp);
-
- /*
- * If the superblock buffer already has this transaction
- * pointer in its b_fsprivate2 field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the buffer to the caller.
- */
- bp = mp->m_sb_bp;
if (bp->b_transp == tp) {
- bip = bp->b_log_item;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
+
trace_xfs_trans_getsb_recur(bip);
- return bp;
- }
+ } else {
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ _xfs_trans_bjoin(tp, bp, 1);
- bp = xfs_getsb(mp);
- if (bp == NULL)
- return NULL;
+ trace_xfs_trans_getsb(bp->b_log_item);
+ }
- _xfs_trans_bjoin(tp, bp, 1);
- trace_xfs_trans_getsb(bp->b_log_item);
return bp;
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c6ba7ef18e06..133fc6fc3edd 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -55,6 +55,12 @@ xfs_trans_log_dquot(
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ /* Upgrade the dquot to bigtime format if possible. */
+ if (dqp->q_id != 0 &&
+ xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) &&
+ !(dqp->q_type & XFS_DQTYPE_BIGTIME))
+ dqp->q_type |= XFS_DQTYPE_BIGTIME;
+
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 4a0bff65d3d6..8ec7c8f109d7 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -335,7 +335,7 @@ static void zonefs_io_error(struct inode *inode, bool write)
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
unsigned int nr_zones =
- zi->i_max_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
struct zonefs_ioerr_data err = {
.inode = inode,
.write = write,
@@ -398,7 +398,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
goto unlock;
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
- zi->i_max_size >> SECTOR_SHIFT, GFP_NOFS);
+ zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
if (ret) {
zonefs_err(inode->i_sb,
"Zone management operation at %llu failed %d",
@@ -1053,14 +1053,16 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
zi->i_ztype = type;
zi->i_zsector = zone->start;
+ zi->i_zone_size = zone->len << SECTOR_SHIFT;
+
zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
- zone->len << SECTOR_SHIFT);
+ zone->capacity << SECTOR_SHIFT);
zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;
inode->i_size = zi->i_wpoffset;
- inode->i_blocks = zone->len;
+ inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;
inode->i_op = &zonefs_file_inode_operations;
inode->i_fop = &zonefs_file_operations;
@@ -1167,12 +1169,18 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
if (zonefs_zone_type(next) != type)
break;
zone->len += next->len;
+ zone->capacity += next->capacity;
if (next->cond == BLK_ZONE_COND_READONLY &&
zone->cond != BLK_ZONE_COND_OFFLINE)
zone->cond = BLK_ZONE_COND_READONLY;
else if (next->cond == BLK_ZONE_COND_OFFLINE)
zone->cond = BLK_ZONE_COND_OFFLINE;
}
+ if (zone->capacity != zone->len) {
+ zonefs_err(sb, "Invalid conventional zone capacity\n");
+ ret = -EINVAL;
+ goto free;
+ }
}
/*
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index ad17fef7ce91..55b39970acb2 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -56,6 +56,9 @@ struct zonefs_inode_info {
/* File maximum size */
loff_t i_max_size;
+ /* File zone size */
+ loff_t i_zone_size;
+
/*
* To serialise fully against both syscall and mmap based IO and
* sequential file truncation, two locks are used. For serializing