Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Makefile | 5
-rw-r--r--  fs/9p/acl.c | 83
-rw-r--r--  fs/9p/cache.c | 8
-rw-r--r--  fs/9p/cache.h | 1
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/v9fs.h | 2
-rw-r--r--  fs/9p/vfs_file.c | 5
-rw-r--r--  fs/9p/vfs_inode.c | 33
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 24
-rw-r--r--  fs/9p/xattr.c | 38
-rw-r--r--  fs/9p/xattr.h | 3
-rw-r--r--  fs/9p/xattr_security.c | 80
-rw-r--r--  fs/9p/xattr_trusted.c | 80
-rw-r--r--  fs/9p/xattr_user.c | 80
-rw-r--r--  fs/Kconfig | 17
-rw-r--r--  fs/Makefile | 7
-rw-r--r--  fs/adfs/adfs.h | 6
-rw-r--r--  fs/adfs/dir.c | 6
-rw-r--r--  fs/adfs/dir_f.c | 2
-rw-r--r--  fs/adfs/dir_fplus.c | 2
-rw-r--r--  fs/adfs/super.c | 2
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/amigaffs.c | 13
-rw-r--r--  fs/affs/inode.c | 1
-rw-r--r--  fs/affs/namei.c | 1
-rw-r--r--  fs/affs/super.c | 4
-rw-r--r--  fs/affs/symlink.c | 9
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/proc.c | 25
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/autofs4/symlink.c | 14
-rw-r--r--  fs/bad_inode.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 42
-rw-r--r--  fs/bfs/inode.c | 2
-rw-r--r--  fs/binfmt_elf.c | 22
-rw-r--r--  fs/binfmt_elf_fdpic.c | 68
-rw-r--r--  fs/block_dev.c | 199
-rw-r--r--  fs/btrfs/Makefile | 5
-rw-r--r--  fs/btrfs/acl.c | 12
-rw-r--r--  fs/btrfs/async-thread.c | 4
-rw-r--r--  fs/btrfs/backref.c | 31
-rw-r--r--  fs/btrfs/btrfs_inode.h | 4
-rw-r--r--  fs/btrfs/check-integrity.c | 113
-rw-r--r--  fs/btrfs/compression.c | 104
-rw-r--r--  fs/btrfs/ctree.c | 21
-rw-r--r--  fs/btrfs/ctree.h | 372
-rw-r--r--  fs/btrfs/delayed-inode.c | 11
-rw-r--r--  fs/btrfs/delayed-ref.c | 194
-rw-r--r--  fs/btrfs/delayed-ref.h | 31
-rw-r--r--  fs/btrfs/dev-replace.c | 55
-rw-r--r--  fs/btrfs/disk-io.c | 222
-rw-r--r--  fs/btrfs/disk-io.h | 7
-rw-r--r--  fs/btrfs/extent-tree.c | 904
-rw-r--r--  fs/btrfs/extent-tree.h | 0
-rw-r--r--  fs/btrfs/extent_io.c | 457
-rw-r--r--  fs/btrfs/extent_io.h | 155
-rw-r--r--  fs/btrfs/file-item.c | 4
-rw-r--r--  fs/btrfs/file.c | 283
-rw-r--r--  fs/btrfs/free-space-cache.c | 121
-rw-r--r--  fs/btrfs/free-space-cache.h | 3
-rw-r--r--  fs/btrfs/free-space-tree.c | 1591
-rw-r--r--  fs/btrfs/free-space-tree.h | 72
-rw-r--r--  fs/btrfs/inode-item.c | 2
-rw-r--r--  fs/btrfs/inode-map.c | 14
-rw-r--r--  fs/btrfs/inode.c | 617
-rw-r--r--  fs/btrfs/ioctl.c | 540
-rw-r--r--  fs/btrfs/locking.c | 14
-rw-r--r--  fs/btrfs/ordered-data.c | 70
-rw-r--r--  fs/btrfs/ordered-data.h | 2
-rw-r--r--  fs/btrfs/props.c | 13
-rw-r--r--  fs/btrfs/qgroup.c | 247
-rw-r--r--  fs/btrfs/qgroup.h | 31
-rw-r--r--  fs/btrfs/raid56.c | 8
-rw-r--r--  fs/btrfs/reada.c | 8
-rw-r--r--  fs/btrfs/relocation.c | 38
-rw-r--r--  fs/btrfs/root-tree.c | 11
-rw-r--r--  fs/btrfs/scrub.c | 295
-rw-r--r--  fs/btrfs/send.c | 228
-rw-r--r--  fs/btrfs/send.h | 4
-rw-r--r--  fs/btrfs/super.c | 145
-rw-r--r--  fs/btrfs/sysfs.c | 52
-rw-r--r--  fs/btrfs/sysfs.h | 4
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 58
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 10
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 149
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 251
-rw-r--r--  fs/btrfs/tests/free-space-tree-tests.c | 571
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 2
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 20
-rw-r--r--  fs/btrfs/transaction.c | 212
-rw-r--r--  fs/btrfs/transaction.h | 28
-rw-r--r--  fs/btrfs/tree-defrag.c | 27
-rw-r--r--  fs/btrfs/tree-log.c | 24
-rw-r--r--  fs/btrfs/volumes.c | 545
-rw-r--r--  fs/btrfs/volumes.h | 19
-rw-r--r--  fs/btrfs/xattr.c | 174
-rw-r--r--  fs/btrfs/xattr.h | 2
-rw-r--r--  fs/buffer.c | 47
-rw-r--r--  fs/cachefiles/daemon.c | 12
-rw-r--r--  fs/cachefiles/internal.h | 2
-rw-r--r--  fs/cachefiles/namei.c | 2
-rw-r--r--  fs/cachefiles/rdwr.c | 75
-rw-r--r--  fs/ceph/acl.c | 16
-rw-r--r--  fs/ceph/addr.c | 7
-rw-r--r--  fs/ceph/cache.c | 2
-rw-r--r--  fs/ceph/caps.c | 76
-rw-r--r--  fs/ceph/file.c | 87
-rw-r--r--  fs/ceph/inode.c | 3
-rw-r--r--  fs/ceph/locks.c | 4
-rw-r--r--  fs/ceph/mds_client.c | 57
-rw-r--r--  fs/ceph/mds_client.h | 3
-rw-r--r--  fs/ceph/super.c | 4
-rw-r--r--  fs/ceph/super.h | 1
-rw-r--r--  fs/cifs/cifs_spnego.c | 6
-rw-r--r--  fs/cifs/cifsacl.c | 25
-rw-r--r--  fs/cifs/cifsfs.c | 82
-rw-r--r--  fs/cifs/cifsfs.h | 6
-rw-r--r--  fs/cifs/cifsglob.h | 8
-rw-r--r--  fs/cifs/connect.c | 87
-rw-r--r--  fs/cifs/file.c | 12
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/ioctl.c | 123
-rw-r--r--  fs/cifs/link.c | 10
-rw-r--r--  fs/cifs/readdir.c | 2
-rw-r--r--  fs/cifs/sess.c | 2
-rw-r--r--  fs/cifs/smb2file.c | 19
-rw-r--r--  fs/cifs/smb2ops.c | 10
-rw-r--r--  fs/cifs/smb2pdu.c | 125
-rw-r--r--  fs/cifs/smb2pdu.h | 45
-rw-r--r--  fs/cifs/smbfsctl.h | 2
-rw-r--r--  fs/cifs/xattr.c | 16
-rw-r--r--  fs/coda/cnode.c | 5
-rw-r--r--  fs/coda/inode.c | 6
-rw-r--r--  fs/coda/symlink.c | 4
-rw-r--r--  fs/compat.c | 21
-rw-r--r--  fs/compat_ioctl.c | 271
-rw-r--r--  fs/configfs/configfs_internal.h | 14
-rw-r--r--  fs/configfs/dir.c | 278
-rw-r--r--  fs/configfs/file.c | 261
-rw-r--r--  fs/configfs/inode.c | 2
-rw-r--r--  fs/configfs/symlink.c | 22
-rw-r--r--  fs/coredump.c | 43
-rw-r--r--  fs/cramfs/inode.c | 1
-rw-r--r--  fs/dax.c | 307
-rw-r--r--  fs/dcache.c | 21
-rw-r--r--  fs/debugfs/file.c | 177
-rw-r--r--  fs/debugfs/inode.c | 12
-rw-r--r--  fs/direct-io.c | 36
-rw-r--r--  fs/dlm/config.c | 288
-rw-r--r--  fs/dlm/lowcomms.c | 4
-rw-r--r--  fs/dlm/plock.c | 6
-rw-r--r--  fs/dlm/user.c | 11
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/inode.c | 23
-rw-r--r--  fs/ecryptfs/main.c | 6
-rw-r--r--  fs/efs/inode.c | 1
-rw-r--r--  fs/efs/super.c | 6
-rw-r--r--  fs/efs/symlink.c | 4
-rw-r--r--  fs/eventfd.c | 4
-rw-r--r--  fs/exec.c | 4
-rw-r--r--  fs/exofs/inode.c | 6
-rw-r--r--  fs/exofs/namei.c | 4
-rw-r--r--  fs/exofs/super.c | 4
-rw-r--r--  fs/ext2/ext2.h | 11
-rw-r--r--  fs/ext2/file.c | 84
-rw-r--r--  fs/ext2/inode.c | 11
-rw-r--r--  fs/ext2/namei.c | 4
-rw-r--r--  fs/ext2/super.c | 7
-rw-r--r--  fs/ext2/symlink.c | 5
-rw-r--r--  fs/ext2/xattr.c | 22
-rw-r--r--  fs/ext2/xattr_security.c | 30
-rw-r--r--  fs/ext2/xattr_trusted.c | 32
-rw-r--r--  fs/ext2/xattr_user.c | 32
-rw-r--r--  fs/ext4/Makefile | 2
-rw-r--r--  fs/ext4/balloc.c | 92
-rw-r--r--  fs/ext4/block_validity.c | 2
-rw-r--r--  fs/ext4/crypto.c | 54
-rw-r--r--  fs/ext4/crypto_fname.c | 2
-rw-r--r--  fs/ext4/crypto_key.c | 20
-rw-r--r--  fs/ext4/crypto_policy.c | 3
-rw-r--r--  fs/ext4/dir.c | 7
-rw-r--r--  fs/ext4/ext4.h | 228
-rw-r--r--  fs/ext4/ext4_jbd2.c | 6
-rw-r--r--  fs/ext4/ext4_jbd2.h | 10
-rw-r--r--  fs/ext4/extents.c | 88
-rw-r--r--  fs/ext4/extents_status.c | 60
-rw-r--r--  fs/ext4/extents_status.h | 2
-rw-r--r--  fs/ext4/ialloc.c | 113
-rw-r--r--  fs/ext4/indirect.c | 5
-rw-r--r--  fs/ext4/inline.c | 3
-rw-r--r--  fs/ext4/inode.c | 90
-rw-r--r--  fs/ext4/ioctl.c | 15
-rw-r--r--  fs/ext4/mballoc.c | 87
-rw-r--r--  fs/ext4/migrate.c | 9
-rw-r--r--  fs/ext4/mmp.c | 8
-rw-r--r--  fs/ext4/namei.c | 42
-rw-r--r--  fs/ext4/page-io.c | 10
-rw-r--r--  fs/ext4/readpage.c | 4
-rw-r--r--  fs/ext4/resize.c | 34
-rw-r--r--  fs/ext4/super.c | 705
-rw-r--r--  fs/ext4/symlink.c | 33
-rw-r--r--  fs/ext4/sysfs.c | 448
-rw-r--r--  fs/ext4/xattr.c | 56
-rw-r--r--  fs/ext4/xattr_security.c | 31
-rw-r--r--  fs/ext4/xattr_trusted.c | 32
-rw-r--r--  fs/ext4/xattr_user.c | 32
-rw-r--r--  fs/f2fs/checkpoint.c | 222
-rw-r--r--  fs/f2fs/crypto_key.c | 4
-rw-r--r--  fs/f2fs/data.c | 489
-rw-r--r--  fs/f2fs/debug.c | 71
-rw-r--r--  fs/f2fs/dir.c | 57
-rw-r--r--  fs/f2fs/extent_cache.c | 315
-rw-r--r--  fs/f2fs/f2fs.h | 232
-rw-r--r--  fs/f2fs/file.c | 639
-rw-r--r--  fs/f2fs/gc.c | 86
-rw-r--r--  fs/f2fs/gc.h | 14
-rw-r--r--  fs/f2fs/inline.c | 51
-rw-r--r--  fs/f2fs/inode.c | 35
-rw-r--r--  fs/f2fs/namei.c | 112
-rw-r--r--  fs/f2fs/node.c | 196
-rw-r--r--  fs/f2fs/node.h | 10
-rw-r--r--  fs/f2fs/recovery.c | 55
-rw-r--r--  fs/f2fs/segment.c | 294
-rw-r--r--  fs/f2fs/segment.h | 4
-rw-r--r--  fs/f2fs/shrinker.c | 3
-rw-r--r--  fs/f2fs/super.c | 287
-rw-r--r--  fs/f2fs/xattr.c | 125
-rw-r--r--  fs/f2fs/xattr.h | 2
-rw-r--r--  fs/fat/dir.c | 16
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/fcntl.c | 3
-rw-r--r--  fs/file.c | 107
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 1
-rw-r--r--  fs/fs-writeback.c | 16
-rw-r--r--  fs/fscache/cookie.c | 2
-rw-r--r--  fs/fscache/netfs.c | 38
-rw-r--r--  fs/fscache/object-list.c | 4
-rw-r--r--  fs/fscache/page.c | 8
-rw-r--r--  fs/fuse/dir.c | 17
-rw-r--r--  fs/fuse/file.c | 2
-rw-r--r--  fs/fuse/inode.c | 4
-rw-r--r--  fs/gfs2/acl.c | 4
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 2
-rw-r--r--  fs/gfs2/bmap.c | 13
-rw-r--r--  fs/gfs2/dir.c | 182
-rw-r--r--  fs/gfs2/file.c | 44
-rw-r--r--  fs/gfs2/glock.c | 102
-rw-r--r--  fs/gfs2/glock.h | 30
-rw-r--r--  fs/gfs2/glops.c | 12
-rw-r--r--  fs/gfs2/incore.h | 26
-rw-r--r--  fs/gfs2/inode.c | 67
-rw-r--r--  fs/gfs2/lock_dlm.c | 2
-rw-r--r--  fs/gfs2/log.c | 3
-rw-r--r--  fs/gfs2/main.c | 21
-rw-r--r--  fs/gfs2/meta_io.c | 82
-rw-r--r--  fs/gfs2/meta_io.h | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 14
-rw-r--r--  fs/gfs2/quota.c | 117
-rw-r--r--  fs/gfs2/quota.h | 2
-rw-r--r--  fs/gfs2/rgrp.c | 66
-rw-r--r--  fs/gfs2/rgrp.h | 6
-rw-r--r--  fs/gfs2/super.c | 43
-rw-r--r--  fs/gfs2/trans.c | 4
-rw-r--r--  fs/gfs2/util.c | 2
-rw-r--r--  fs/gfs2/util.h | 2
-rw-r--r--  fs/gfs2/xattr.c | 73
-rw-r--r--  fs/gfs2/xattr.h | 1
-rw-r--r--  fs/hfs/mdb.c | 4
-rw-r--r--  fs/hfs/super.c | 4
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/posix_acl.c | 8
-rw-r--r--  fs/hfsplus/super.c | 2
-rw-r--r--  fs/hfsplus/xattr.c | 33
-rw-r--r--  fs/hfsplus/xattr_security.c | 21
-rw-r--r--  fs/hfsplus/xattr_trusted.c | 21
-rw-r--r--  fs/hfsplus/xattr_user.c | 21
-rw-r--r--  fs/hostfs/hostfs_kern.c | 28
-rw-r--r--  fs/hpfs/inode.c | 1
-rw-r--r--  fs/hpfs/map.c | 2
-rw-r--r--  fs/hpfs/namei.c | 7
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 214
-rw-r--r--  fs/inode.c | 11
-rw-r--r--  fs/internal.h | 9
-rw-r--r--  fs/ioctl.c | 71
-rw-r--r--  fs/isofs/inode.c | 3
-rw-r--r--  fs/isofs/rock.c | 4
-rw-r--r--  fs/jbd2/checkpoint.c | 8
-rw-r--r--  fs/jbd2/commit.c | 22
-rw-r--r--  fs/jbd2/journal.c | 27
-rw-r--r--  fs/jbd2/recovery.c | 26
-rw-r--r--  fs/jbd2/revoke.c | 4
-rw-r--r--  fs/jbd2/transaction.c | 22
-rw-r--r--  fs/jffs2/background.c | 7
-rw-r--r--  fs/jffs2/dir.c | 3
-rw-r--r--  fs/jffs2/malloc.c | 27
-rw-r--r--  fs/jffs2/readinode.c | 6
-rw-r--r--  fs/jffs2/security.c | 30
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/symlink.c | 2
-rw-r--r--  fs/jffs2/wbuf.c | 5
-rw-r--r--  fs/jffs2/xattr.c | 25
-rw-r--r--  fs/jffs2/xattr_trusted.c | 26
-rw-r--r--  fs/jffs2/xattr_user.c | 28
-rw-r--r--  fs/jfs/acl.c | 8
-rw-r--r--  fs/jfs/inode.c | 1
-rw-r--r--  fs/jfs/jfs_logmgr.c | 9
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/super.c | 5
-rw-r--r--  fs/jfs/symlink.c | 5
-rw-r--r--  fs/kernfs/dir.c | 55
-rw-r--r--  fs/kernfs/inode.c | 4
-rw-r--r--  fs/kernfs/symlink.c | 24
-rw-r--r--  fs/libfs.c | 22
-rw-r--r--  fs/lockd/clntproc.c | 13
-rw-r--r--  fs/lockd/host.c | 8
-rw-r--r--  fs/lockd/mon.c | 125
-rw-r--r--  fs/lockd/netns.h | 4
-rw-r--r--  fs/lockd/svc.c | 81
-rw-r--r--  fs/lockd/svc4proc.c | 2
-rw-r--r--  fs/lockd/svcproc.c | 2
-rw-r--r--  fs/locks.c | 266
-rw-r--r--  fs/logfs/Kconfig | 2
-rw-r--r--  fs/logfs/dev_bdev.c | 4
-rw-r--r--  fs/logfs/dir.c | 9
-rw-r--r--  fs/logfs/inode.c | 6
-rw-r--r--  fs/logfs/logfs.h | 7
-rw-r--r--  fs/logfs/readwrite.c | 4
-rw-r--r--  fs/logfs/segment.c | 4
-rw-r--r--  fs/minix/inode.c | 6
-rw-r--r--  fs/minix/itree_v1.c | 9
-rw-r--r--  fs/minix/itree_v2.c | 9
-rw-r--r--  fs/mpage.c | 27
-rw-r--r--  fs/namei.c | 340
-rw-r--r--  fs/namespace.c | 49
-rw-r--r--  fs/ncpfs/dir.c | 4
-rw-r--r--  fs/ncpfs/inode.c | 6
-rw-r--r--  fs/ncpfs/ioctl.c | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 7
-rw-r--r--  fs/nfs/callback.c | 40
-rw-r--r--  fs/nfs/callback.h | 12
-rw-r--r--  fs/nfs/callback_proc.c | 54
-rw-r--r--  fs/nfs/callback_xdr.c | 39
-rw-r--r--  fs/nfs/client.c | 1
-rw-r--r--  fs/nfs/delegation.c | 6
-rw-r--r--  fs/nfs/dir.c | 29
-rw-r--r--  fs/nfs/direct.c | 48
-rw-r--r--  fs/nfs/file.c | 25
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 18
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 245
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 8
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 16
-rw-r--r--  fs/nfs/inode.c | 119
-rw-r--r--  fs/nfs/internal.h | 41
-rw-r--r--  fs/nfs/mount_clnt.c | 4
-rw-r--r--  fs/nfs/nfs3acl.c | 4
-rw-r--r--  fs/nfs/nfs42.h | 1
-rw-r--r--  fs/nfs/nfs42proc.c | 103
-rw-r--r--  fs/nfs/nfs42xdr.c | 97
-rw-r--r--  fs/nfs/nfs4_fs.h | 6
-rw-r--r--  fs/nfs/nfs4client.c | 2
-rw-r--r--  fs/nfs/nfs4file.c | 80
-rw-r--r--  fs/nfs/nfs4idmap.c | 4
-rw-r--r--  fs/nfs/nfs4proc.c | 363
-rw-r--r--  fs/nfs/nfs4sysctl.c | 2
-rw-r--r--  fs/nfs/nfs4trace.c | 1
-rw-r--r--  fs/nfs/nfs4trace.h | 431
-rw-r--r--  fs/nfs/nfs4xdr.c | 54
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/nfstrace.h | 1
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 7
-rw-r--r--  fs/nfs/pagelist.c | 126
-rw-r--r--  fs/nfs/pnfs.c | 246
-rw-r--r--  fs/nfs/pnfs.h | 54
-rw-r--r--  fs/nfs/pnfs_nfs.c | 10
-rw-r--r--  fs/nfs/read.c | 52
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfs/symlink.c | 39
-rw-r--r--  fs/nfs/write.c | 140
-rw-r--r--  fs/nfsd/lockd.c | 2
-rw-r--r--  fs/nfsd/netns.h | 2
-rw-r--r--  fs/nfsd/nfs3xdr.c | 6
-rw-r--r--  fs/nfsd/nfs4callback.c | 6
-rw-r--r--  fs/nfsd/nfs4layouts.c | 73
-rw-r--r--  fs/nfsd/nfs4proc.c | 67
-rw-r--r--  fs/nfsd/nfs4recover.c | 6
-rw-r--r--  fs/nfsd/nfs4state.c | 335
-rw-r--r--  fs/nfsd/nfs4xdr.c | 29
-rw-r--r--  fs/nfsd/nfscache.c | 32
-rw-r--r--  fs/nfsd/nfsfh.c | 5
-rw-r--r--  fs/nfsd/nfsfh.h | 43
-rw-r--r--  fs/nfsd/nfssvc.c | 75
-rw-r--r--  fs/nfsd/state.h | 51
-rw-r--r--  fs/nfsd/trace.c | 2
-rw-r--r--  fs/nfsd/trace.h | 43
-rw-r--r--  fs/nfsd/vfs.c | 50
-rw-r--r--  fs/nfsd/vfs.h | 6
-rw-r--r--  fs/nfsd/xdr4.h | 12
-rw-r--r--  fs/nilfs2/alloc.c | 308
-rw-r--r--  fs/nilfs2/alloc.h | 1
-rw-r--r--  fs/nilfs2/btree.c | 7
-rw-r--r--  fs/nilfs2/dat.c | 2
-rw-r--r--  fs/nilfs2/file.c | 2
-rw-r--r--  fs/nilfs2/inode.c | 5
-rw-r--r--  fs/nilfs2/mdt.c | 6
-rw-r--r--  fs/nilfs2/mdt.h | 2
-rw-r--r--  fs/nilfs2/namei.c | 7
-rw-r--r--  fs/nilfs2/recovery.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 107
-rw-r--r--  fs/nilfs2/segment.h | 3
-rw-r--r--  fs/nilfs2/sufile.c | 11
-rw-r--r--  fs/nilfs2/super.c | 26
-rw-r--r--  fs/notify/fdinfo.c | 9
-rw-r--r--  fs/notify/inode_mark.c | 3
-rw-r--r--  fs/notify/inotify/inotify_user.c | 14
-rw-r--r--  fs/notify/mark.c | 66
-rw-r--r--  fs/ntfs/file.c | 4
-rw-r--r--  fs/ntfs/super.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 15
-rw-r--r--  fs/ocfs2/alloc.h | 2
-rw-r--r--  fs/ocfs2/aops.c | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 228
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 283
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 11
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 37
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 17
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 11
-rw-r--r--  fs/ocfs2/file.c | 8
-rw-r--r--  fs/ocfs2/inode.c | 1
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 4
-rw-r--r--  fs/ocfs2/journal.c | 115
-rw-r--r--  fs/ocfs2/localalloc.c | 10
-rw-r--r--  fs/ocfs2/locks.c | 13
-rw-r--r--  fs/ocfs2/namei.c | 37
-rw-r--r--  fs/ocfs2/namei.h | 3
-rw-r--r--  fs/ocfs2/quota.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 5
-rw-r--r--  fs/ocfs2/resize.c | 15
-rw-r--r--  fs/ocfs2/slot_map.c | 14
-rw-r--r--  fs/ocfs2/suballoc.c | 5
-rw-r--r--  fs/ocfs2/super.c | 13
-rw-r--r--  fs/ocfs2/symlink.c | 3
-rw-r--r--  fs/ocfs2/xattr.c | 195
-rw-r--r--  fs/open.c | 5
-rw-r--r--  fs/openpromfs/inode.c | 2
-rw-r--r--  fs/overlayfs/copy_up.c | 23
-rw-r--r--  fs/overlayfs/inode.c | 72
-rw-r--r--  fs/overlayfs/overlayfs.h | 3
-rw-r--r--  fs/pipe.c | 18
-rw-r--r--  fs/posix_acl.c | 43
-rw-r--r--  fs/proc/array.c | 26
-rw-r--r--  fs/proc/base.c | 61
-rw-r--r--  fs/proc/fd.c | 15
-rw-r--r--  fs/proc/inode.c | 24
-rw-r--r--  fs/proc/meminfo.c | 12
-rw-r--r--  fs/proc/namespaces.c | 10
-rw-r--r--  fs/proc/page.c | 4
-rw-r--r--  fs/proc/proc_sysctl.c | 2
-rw-r--r--  fs/proc/self.c | 18
-rw-r--r--  fs/proc/task_mmu.c | 197
-rw-r--r--  fs/proc/thread_self.c | 19
-rw-r--r--  fs/proc_namespace.c | 27
-rw-r--r--  fs/pstore/Kconfig | 2
-rw-r--r--  fs/pstore/Makefile | 6
-rw-r--r--  fs/pstore/ftrace.c | 25
-rw-r--r--  fs/pstore/inode.c | 11
-rw-r--r--  fs/pstore/internal.h | 6
-rw-r--r--  fs/pstore/platform.c | 47
-rw-r--r--  fs/pstore/pmsg.c | 9
-rw-r--r--  fs/pstore/ram.c | 19
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/qnx6/inode.c | 3
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/quota/netlink.c | 5
-rw-r--r--  fs/quota/quota_v2.c | 4
-rw-r--r--  fs/ramfs/inode.c | 1
-rw-r--r--  fs/read_write.c | 341
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/journal.c | 24
-rw-r--r--  fs/reiserfs/namei.c | 7
-rw-r--r--  fs/reiserfs/prints.c | 9
-rw-r--r--  fs/reiserfs/procfs.c | 5
-rw-r--r--  fs/reiserfs/super.c | 3
-rw-r--r--  fs/reiserfs/xattr.c | 22
-rw-r--r--  fs/reiserfs/xattr_acl.c | 8
-rw-r--r--  fs/reiserfs/xattr_security.c | 23
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 22
-rw-r--r--  fs/reiserfs/xattr_user.c | 21
-rw-r--r--  fs/romfs/super.c | 5
-rw-r--r--  fs/select.c | 6
-rw-r--r--  fs/seq_file.c | 51
-rw-r--r--  fs/splice.c | 18
-rw-r--r--  fs/squashfs/inode.c | 2
-rw-r--r--  fs/squashfs/super.c | 10
-rw-r--r--  fs/squashfs/symlink.c | 3
-rw-r--r--  fs/squashfs/xattr.c | 86
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/super.c | 6
-rw-r--r--  fs/sync.c | 10
-rw-r--r--  fs/sysfs/file.c | 4
-rw-r--r--  fs/sysfs/group.c | 61
-rw-r--r--  fs/sysv/inode.c | 17
-rw-r--r--  fs/tracefs/inode.c | 6
-rw-r--r--  fs/ubifs/Kconfig | 15
-rw-r--r--  fs/ubifs/debug.c | 8
-rw-r--r--  fs/ubifs/dir.c | 11
-rw-r--r--  fs/ubifs/file.c | 52
-rw-r--r--  fs/ubifs/key.h | 6
-rw-r--r--  fs/ubifs/lpt.c | 6
-rw-r--r--  fs/ubifs/misc.h | 9
-rw-r--r--  fs/ubifs/recovery.c | 8
-rw-r--r--  fs/ubifs/super.c | 17
-rw-r--r--  fs/ubifs/tnc.c | 3
-rw-r--r--  fs/ubifs/ubifs.h | 8
-rw-r--r--  fs/ubifs/xattr.c | 56
-rw-r--r--  fs/udf/balloc.c | 98
-rw-r--r--  fs/udf/inode.c | 246
-rw-r--r--  fs/udf/namei.c | 8
-rw-r--r--  fs/udf/super.c | 17
-rw-r--r--  fs/udf/symlink.c | 4
-rw-r--r--  fs/udf/udfdecl.h | 5
-rw-r--r--  fs/udf/unicode.c | 21
-rw-r--r--  fs/ufs/Makefile | 2
-rw-r--r--  fs/ufs/inode.c | 5
-rw-r--r--  fs/ufs/namei.c | 5
-rw-r--r--  fs/ufs/super.c | 2
-rw-r--r--  fs/ufs/symlink.c | 42
-rw-r--r--  fs/ufs/ufs.h | 4
-rw-r--r--  fs/xattr.c | 232
-rw-r--r--  fs/xfs/Makefile | 2
-rw-r--r--  fs/xfs/kmem.c | 10
-rw-r--r--  fs/xfs/kmem.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 34
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 9
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 35
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 147
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 34
-rw-r--r--  fs/xfs/libxfs/xfs_bit.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 108
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 15
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 79
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 42
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 37
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 20
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 11
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 27
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 8
-rw-r--r--  fs/xfs/xfs_acl.c | 37
-rw-r--r--  fs/xfs/xfs_acl.h | 8
-rw-r--r--  fs/xfs/xfs_aops.c | 121
-rw-r--r--  fs/xfs/xfs_aops.h | 3
-rw-r--r--  fs/xfs/xfs_attr_list.c | 2
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 81
-rw-r--r--  fs/xfs/xfs_buf.c | 38
-rw-r--r--  fs/xfs/xfs_buf.h | 1
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 2
-rw-r--r--  fs/xfs/xfs_dquot.c | 27
-rw-r--r--  fs/xfs/xfs_error.c | 4
-rw-r--r--  fs/xfs/xfs_file.c | 139
-rw-r--r--  fs/xfs/xfs_icache.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 33
-rw-r--r--  fs/xfs/xfs_inode_item.c | 1
-rw-r--r--  fs/xfs/xfs_inode_item.h | 1
-rw-r--r--  fs/xfs/xfs_ioctl.c | 23
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 85
-rw-r--r--  fs/xfs/xfs_iops.c | 18
-rw-r--r--  fs/xfs/xfs_linux.h | 7
-rw-r--r--  fs/xfs/xfs_log.c | 144
-rw-r--r--  fs/xfs/xfs_log.h | 1
-rw-r--r--  fs/xfs/xfs_log_priv.h | 54
-rw-r--r--  fs/xfs/xfs_log_recover.c | 595
-rw-r--r--  fs/xfs/xfs_message.c | 7
-rw-r--r--  fs/xfs/xfs_mount.c | 21
-rw-r--r--  fs/xfs/xfs_mount.h | 5
-rw-r--r--  fs/xfs/xfs_pnfs.c | 5
-rw-r--r--  fs/xfs/xfs_qm.c | 16
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 3
-rw-r--r--  fs/xfs/xfs_stats.c | 93
-rw-r--r--  fs/xfs/xfs_stats.h | 36
-rw-r--r--  fs/xfs/xfs_super.c | 63
-rw-r--r--  fs/xfs/xfs_symlink.c | 12
-rw-r--r--  fs/xfs/xfs_sysctl.c | 15
-rw-r--r--  fs/xfs/xfs_sysfs.c | 213
-rw-r--r--  fs/xfs/xfs_sysfs.h | 1
-rw-r--r--  fs/xfs/xfs_trace.h | 28
-rw-r--r--  fs/xfs/xfs_trans.c | 6
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 13
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 14
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 9
-rw-r--r--  fs/xfs/xfs_xattr.c | 184
612 files changed, 20729 insertions, 11416 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ff7be98f84f2..9619ccadd2fc 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -10,10 +10,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dentry.o \
v9fs.o \
fid.o \
- xattr.o \
- xattr_user.o \
- xattr_trusted.o
+ xattr.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
-9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 31c010372660..9da967f38387 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
return 0;
}
/* get the default/access acl values and cache them */
- dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
- pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+ dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT);
+ pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS);
if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
@@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
goto err_free_out;
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -212,42 +212,22 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
return 0;
}
-static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_get(dentry, full_name, buffer, size);
-}
-
-static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct v9fs_session_info *v9ses;
struct posix_acl *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* We allow set/get/list of acl when access=client is not specified
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+ return v9fs_xattr_get(dentry, handler->name, buffer, size);
- acl = v9fs_get_cached_acl(d_inode(dentry), type);
+ acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -258,46 +238,23 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
return error;
}
-static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_set(dentry, full_name, value, size, flags);
-}
-
-
-static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
+static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
int retval;
struct posix_acl *acl;
struct v9fs_session_info *v9ses;
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* set the attribute on the remote. Without even looking at the
* xattr value. We leave it to the server to validate
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_set_acl(dentry, name,
- value, size, flags, type);
+ return v9fs_xattr_set(dentry, handler->name, value, size,
+ flags);
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -316,9 +273,8 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
} else
acl = NULL;
- switch (type) {
+ switch (handler->flags) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
umode_t mode = inode->i_mode;
retval = posix_acl_equiv_mode(acl, &mode);
@@ -349,7 +305,6 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
if (!S_ISDIR(inode->i_mode)) {
retval = acl ? -EINVAL : 0;
goto err_out;
@@ -358,23 +313,23 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, name, value, size, flags);
+ retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
if (!retval)
- set_cached_acl(inode, type, acl);
+ set_cached_acl(inode, handler->flags, acl);
err_out:
posix_acl_release(acl);
return retval;
}
const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
};
const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
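The .prefix -> .name switch above changes how the VFS matches these handlers. A handler published via .name matches exactly one attribute, and its callbacks never see a name suffix, which is why the old strcmp(name, "") guards are deleted. The relevant fields, paraphrased from include/linux/xattr.h of this release (the list callback is omitted for brevity):

struct xattr_handler {
	const char *name;	/* exact match, e.g. "system.posix_acl_access" */
	const char *prefix;	/* namespace match, e.g. "user." */
	int flags;		/* fs-private; 9p stores ACL_TYPE_* here */
	int (*get)(const struct xattr_handler *handler, struct dentry *dentry,
		   const char *name, void *buffer, size_t size);
	int (*set)(const struct xattr_handler *handler, struct dentry *dentry,
		   const char *name, const void *value, size_t size, int flags);
};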
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a69260f27555..103ca5e1267b 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -243,14 +243,14 @@ void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
if (!v9inode->fscache)
return;
- spin_lock(&v9inode->fscache_lock);
+ mutex_lock(&v9inode->fscache_lock);
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
v9fs_cache_inode_flush_cookie(inode);
else
v9fs_cache_inode_get_cookie(inode);
- spin_unlock(&v9inode->fscache_lock);
+ mutex_unlock(&v9inode->fscache_lock);
}
void v9fs_cache_inode_reset_cookie(struct inode *inode)
@@ -264,7 +264,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
old = v9inode->fscache;
- spin_lock(&v9inode->fscache_lock);
+ mutex_lock(&v9inode->fscache_lock);
fscache_relinquish_cookie(v9inode->fscache, 1);
v9ses = v9fs_inode2v9ses(inode);
@@ -274,7 +274,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
inode, old, v9inode->fscache);
- spin_unlock(&v9inode->fscache_lock);
+ mutex_unlock(&v9inode->fscache_lock);
}
int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 2f9675491095..247e47e54bcc 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -21,6 +21,7 @@
*/
#ifndef _9P_CACHE_H
+#define _9P_CACHE_H
#ifdef CONFIG_9P_FSCACHE
#include <linux/fscache.h>
#include <linux/spinlock.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca025019d..072e7599583a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
sizeof(struct v9fs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
v9fs_inode_init_once);
if (!v9fs_inode_cache)
return -ENOMEM;
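SLAB_ACCOUNT, added here and to most other inode caches in this merge, makes objects allocated from the cache count against the allocating task's kernel-memory cgroup. A minimal usage sketch (the cache and struct names are illustrative, not from the patch):

#include <linux/slab.h>

struct example_inode_info { long payload[16]; };	/* stand-in object */

static struct kmem_cache *example_inode_cachep;

static int __init example_init_cache(void)
{
	/* SLAB_ACCOUNT: allocations are charged to the caller's memcg */
	example_inode_cachep = kmem_cache_create("example_inode_cache",
				sizeof(struct example_inode_info), 0,
				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
				SLAB_ACCOUNT, NULL);
	return example_inode_cachep ? 0 : -ENOMEM;
}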
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 0923f2cf3c80..6877050384a1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -123,7 +123,7 @@ struct v9fs_session_info {
struct v9fs_inode {
#ifdef CONFIG_9P_FSCACHE
- spinlock_t fscache_lock;
+ struct mutex fscache_lock;
struct fscache_cookie *fscache;
#endif
struct p9_qid qid;
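The spinlock_t -> struct mutex change pairs with the cache.c hunks above: the fscache calls made while the lock is held can block, and blocking under a spinlock is a bug. An illustrative sketch of the rule, not taken from the patch itself:

#include <linux/mutex.h>
#include <linux/fscache.h>

/* Sleeping calls are forbidden under spin_lock(); a mutex may be held
 * across them, provided the caller is in process context. */
static void relinquish_locked(struct mutex *lock,
			      struct fscache_cookie *cookie)
{
	mutex_lock(lock);			/* may sleep: allowed */
	fscache_relinquish_cookie(cookie, 1);	/* may sleep: safe under a mutex */
	mutex_unlock(lock);
}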
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3abc447783aa..7bf835f85bc8 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -161,7 +161,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
BUG();
- res = posix_lock_file_wait(filp, fl);
+ res = locks_lock_file_wait(filp, fl);
if (res < 0)
goto out;
@@ -231,7 +231,8 @@ out_unlock:
if (res < 0 && fl->fl_type != F_UNLCK) {
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
- res = posix_lock_file_wait(filp, fl);
+ /* Even if this fails we want to return the remote error */
+ locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
out:
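locks_lock_file_wait() is the new generic entry point that dispatches on the lock type, so callers no longer hard-code the POSIX variant. Paraphrased from fs/locks.c of the same period (not part of this diff):

/* locks_lock_file_wait(filp, fl) is simply
 * locks_lock_inode_wait(file_inode(filp), fl). */
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
	switch (fl->fl_flags & (FL_POSIX | FL_FLOCK)) {
	case FL_POSIX:
		return posix_lock_inode_wait(inode, fl);  /* byte-range lock */
	case FL_FLOCK:
		return flock_lock_inode_wait(inode, fl);  /* whole-file lock */
	default:
		BUG();	/* unknown lock type */
	}
	return 0;	/* unreachable */
}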
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b1dc51888048..3a08b3e6ff1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -244,7 +244,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
return NULL;
#ifdef CONFIG_9P_FSCACHE
v9inode->fscache = NULL;
- spin_lock_init(&v9inode->fscache_lock);
+ mutex_init(&v9inode->fscache_lock);
#endif
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
@@ -451,9 +451,9 @@ void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- truncate_inode_pages_final(inode->i_mapping);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- filemap_fdatawrite(inode->i_mapping);
+ filemap_fdatawrite(&inode->i_data);
v9fs_cache_inode_put_cookie(inode);
/* clunk the fid stashed in writeback_fid */
@@ -1223,18 +1223,26 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
}
/**
- * v9fs_vfs_follow_link - follow a symlink path
+ * v9fs_vfs_get_link - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: delayed call for when we are done with the return value
*/
-static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *v9fs_vfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid;
struct p9_wstat *st;
char *res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ fid = v9fs_fid_lookup(dentry);
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (IS_ERR(fid))
@@ -1259,7 +1267,8 @@ static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
p9stat_free(st);
kfree(st);
- return *cookie = res;
+ set_delayed_call(done, kfree_link, res);
+ return res;
}
/**
@@ -1368,9 +1377,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/* build extension */
if (S_ISBLK(mode))
sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
@@ -1455,8 +1461,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
static const struct inode_operations v9fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e8aa57dc8d6d..a34702c998f5 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -829,9 +829,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
@@ -902,26 +899,34 @@ error:
}
/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * v9fs_vfs_get_link_dotl - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: destructor for return value
*/
static const char *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
+v9fs_vfs_get_link_dotl(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct p9_fid *fid;
char *target;
int retval;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
if (retval)
return ERR_PTR(retval);
- return *cookie = target;
+ set_delayed_call(done, kfree_link, target);
+ return target;
}
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -987,8 +992,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link_dotl,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link_dotl,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.setxattr = generic_setxattr,
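Both mknod paths also shed their new_valid_dev() guards. By this point the helper unconditionally returned true (the kernel's dev_t can represent every device number userspace can pass in), so the checks were dead code removed in a kernel-wide cleanup. For reference, the helper as it stood in include/linux/kdev_t.h:

static inline int new_valid_dev(dev_t dev)
{
	return 1;	/* every dev_t is valid; the check is vestigial */
}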
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 0cf44b6cccd6..9dd9b47a6c1a 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -137,6 +137,44 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
}
+static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+static struct xattr_handler v9fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+static struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+#ifdef CONFIG_9P_FS_SECURITY
+static struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+#endif
+
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
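The three 80-line per-namespace files deleted below became redundant because xattr_full_name() recovers the full attribute name without the kmalloc/memcpy dance each of them duplicated. The VFS strips the handler's prefix before invoking ->get()/->set(), but name still points into the caller's original string, so the helper just steps backwards (reproduced from include/linux/xattr.h of this release):

static inline const char *xattr_full_name(const struct xattr_handler *handler,
					  const char *name)
{
	size_t prefix_len = strlen(handler->prefix);

	return name - prefix_len;	/* no allocation needed */
}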
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index d3e2ea3840be..c63c3bea5de5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -19,9 +19,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern struct xattr_handler v9fs_xattr_user_handler;
-extern struct xattr_handler v9fs_xattr_trusted_handler;
-extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
deleted file mode 100644
index cb247a142a6e..000000000000
--- a/fs/9p/xattr_security.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = v9fs_xattr_security_get,
- .set = v9fs_xattr_security_set,
-};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
deleted file mode 100644
index e30d33b8a3fb..000000000000
--- a/fs/9p/xattr_trusted.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .get = v9fs_xattr_trusted_get,
- .set = v9fs_xattr_trusted_set,
-};
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
deleted file mode 100644
index d0b701b72080..000000000000
--- a/fs/9p/xattr_user.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = v9fs_xattr_user_get,
- .set = v9fs_xattr_user_set,
-};
diff --git a/fs/Kconfig b/fs/Kconfig
index da3f32f1a4e4..9adee0d7536e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -46,6 +46,13 @@ config FS_DAX
or if unsure, say N. Saying Y will increase the size of the kernel
by about 5kB.
+config FS_DAX_PMD
+ bool
+ default FS_DAX
+ depends on FS_DAX
+ depends on ZONE_DEVICE
+ depends on TRANSPARENT_HUGEPAGE
+
endif # BLOCK
# Posix ACL utility routines
@@ -67,6 +74,16 @@ config FILE_LOCKING
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
+config MANDATORY_FILE_LOCKING
+ bool "Enable Mandatory file locking"
+ depends on FILE_LOCKING
+ default y
+ help
+ This option enables appropriately marked files on appropriately
+ mounted filesystems to support mandatory locking.
+
+ To the best of my knowledge this is dead code that no one cares about.
+
source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index f79cf4043e60..79f522575cba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -63,10 +63,11 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
-obj-$(CONFIG_EXT2_FS) += ext2/
-# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
-# unless explicitly requested by rootfstype
obj-$(CONFIG_EXT4_FS) += ext4/
+# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
+# ext2 driver, which doesn't know about journalling! Explicitly request ext2
+# by giving the rootfstype= parameter.
+obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_SQUASHFS) += squashfs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 24575d9d882d..ea4aba56f29d 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -45,7 +45,7 @@ struct adfs_dir_ops;
struct adfs_sb_info {
union { struct {
struct adfs_discmap *s_map; /* bh list containing map */
- struct adfs_dir_ops *s_dir; /* directory operations */
+ const struct adfs_dir_ops *s_dir; /* directory operations */
};
struct rcu_head rcu; /* used only at shutdown time */
};
@@ -168,8 +168,8 @@ void __adfs_error(struct super_block *sb, const char *function,
extern const struct inode_operations adfs_dir_inode_operations;
extern const struct file_operations adfs_dir_operations;
extern const struct dentry_operations adfs_dentry_operations;
-extern struct adfs_dir_ops adfs_f_dir_ops;
-extern struct adfs_dir_ops adfs_fplus_dir_ops;
+extern const struct adfs_dir_ops adfs_f_dir_ops;
+extern const struct adfs_dir_ops adfs_fplus_dir_ops;
extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
int wait);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 51c279a29845..fd4cf2c48e48 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -21,7 +21,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
struct adfs_dir dir;
int ret = 0;
@@ -69,7 +69,7 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
{
int ret = -EINVAL;
#ifdef CONFIG_ADFS_FS_RW
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
@@ -129,7 +129,7 @@ static int
adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj)
{
struct super_block *sb = inode->i_sb;
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
int ret;
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 4bbe853ee50a..0fbfd0b04ae0 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -476,7 +476,7 @@ adfs_f_free(struct adfs_dir *dir)
dir->sb = NULL;
}
-struct adfs_dir_ops adfs_f_dir_ops = {
+const struct adfs_dir_ops adfs_f_dir_ops = {
.read = adfs_f_read,
.setpos = adfs_f_setpos,
.getnext = adfs_f_getnext,
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 82d14cdf70f9..c92cfb638c18 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -256,7 +256,7 @@ adfs_fplus_free(struct adfs_dir *dir)
dir->sb = NULL;
}
-struct adfs_dir_ops adfs_fplus_dir_ops = {
+const struct adfs_dir_ops adfs_fplus_dir_ops = {
.read = adfs_fplus_read,
.setpos = adfs_fplus_setpos,
.getnext = adfs_fplus_getnext,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df8344f..c9fdfb112933 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (adfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c69a87eaf57d..cc2b2efc9211 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -138,7 +138,7 @@ extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh);
extern int affs_remove_header(struct dentry *dentry);
extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
-extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
+extern void secs_to_datestamp(time64_t secs, struct affs_date *ds);
extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
__printf(3, 4)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 5fa92bc790ef..d6c7a51c93e4 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -8,6 +8,7 @@
* Please send bug reports to: hjw@zvw.de
*/
+#include <linux/math64.h>
#include "affs.h"
/*
@@ -366,22 +367,22 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh)
}
void
-secs_to_datestamp(time_t secs, struct affs_date *ds)
+secs_to_datestamp(time64_t secs, struct affs_date *ds)
{
u32 days;
u32 minute;
+ s32 rem;
secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60);
if (secs < 0)
secs = 0;
- days = secs / 86400;
- secs -= days * 86400;
- minute = secs / 60;
- secs -= minute * 60;
+ days = div_s64_rem(secs, 86400, &rem);
+ minute = rem / 60;
+ rem -= minute * 60;
ds->days = cpu_to_be32(days);
ds->mins = cpu_to_be32(minute);
- ds->ticks = cpu_to_be32(secs * 50);
+ ds->ticks = cpu_to_be32(rem * 50);
}
umode_t
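The div_s64_rem() conversion above is not cosmetic: in-kernel code cannot apply C's '/' or '%' to 64-bit operands on 32-bit architectures, because gcc would emit calls to libgcc routines (e.g. __divdi3) that the kernel does not link. The helper performs the division once and hands back the 32-bit remainder, mirroring the change above (a sketch with illustrative output parameters):

#include <linux/math64.h>
#include <linux/time64.h>

static void split_datestamp(time64_t secs, u32 *days, u32 *minute, s32 *sec)
{
	s32 rem;

	*days = div_s64_rem(secs, 86400, &rem); /* secs / 86400; rem = secs % 86400 */
	*minute = rem / 60;			/* plain 32-bit arithmetic is fine */
	*sec = rem - *minute * 60;		/* leftover seconds */
}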
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 17349500592d..0fdb0f5b2239 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
case ST_SOFTLINK:
inode->i_mode |= S_IFLNK;
+ inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
inode->i_data.a_ops = &affs_symlink_aops;
break;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 181e05b46e72..00d3002a6780 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
return -ENOSPC;
inode->i_op = &affs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &affs_symlink_aops;
inode->i_mode = S_IFLNK | 0777;
mode_to_prot(inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5b50c4ca43a7..2a6713b6b9f4 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -32,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait)
struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
lock_buffer(bh);
- secs_to_datestamp(get_seconds(), &tail->disk_change);
+ secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change);
affs_fix_checksum(sb, bh);
unlock_buffer(bh);
@@ -132,7 +132,7 @@ static int __init init_inodecache(void)
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (affs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ea5b69a18ba9..69b03dbb792f 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -14,13 +14,13 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
{
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
- char *link = kmap(page);
+ char *link = page_address(page);
struct slink_front *lf;
int i, j;
char c;
char lc;
- pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
+ pr_debug("get_link(ino=%lu)\n", inode->i_ino);
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
@@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
link[i] = '\0';
affs_brelse(bh);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
@@ -73,7 +71,6 @@ const struct address_space_operations affs_symlink_aops = {
const struct inode_operations affs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = affs_notify_change,
};
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e06f5a23352a..86cc7264c21c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
case AFS_FTYPE_SYMLINK:
inode->i_mode = S_IFLNK | vnode->status.mode;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
default:
printk("kAFS: AFS vnode with undefined type\n");
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 24a905b076fd..2853b4095344 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -230,14 +230,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
if (size <= 1 || size >= PAGE_SIZE)
return -EINVAL;
- kbuf = kmalloc(size + 1, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, size) != 0)
- goto done;
- kbuf[size] = 0;
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/* trim to first NL */
name = memchr(kbuf, '\n', size);
@@ -315,15 +310,9 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (size <= 1 || size >= PAGE_SIZE)
return -EINVAL;
- ret = -ENOMEM;
- kbuf = kmalloc(size + 1, GFP_KERNEL);
- if (!kbuf)
- goto nomem;
-
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, size) != 0)
- goto infault;
- kbuf[size] = 0;
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/* trim to first NL */
s = memchr(kbuf, '\n', size);
@@ -337,9 +326,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (ret >= 0)
ret = size; /* consume everything, always */
-infault:
kfree(kbuf);
-nomem:
_leave(" = %d", ret);
return ret;
}
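memdup_user_nul() replaces the open-coded kmalloc + copy_from_user + NUL-terminate sequence in both writers above, which is why their error handling collapses to IS_ERR/PTR_ERR. Its behaviour, paraphrased from mm/util.c of this release:

void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	p = kmalloc(len + 1, GFP_KERNEL);	/* one extra byte for the NUL */
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';				/* result is always terminated */

	return p;
}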
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a5129f7d..81afefe7d8a6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
afs_inode_cachep = kmem_cache_create("afs_inode_cache",
sizeof(struct afs_vnode),
0,
- SLAB_HWCACHE_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
afs_i_init_once);
if (!afs_inode_cachep) {
printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index da0c33481bc0..84e037d1d129 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,10 +12,16 @@
#include "autofs_i.h"
-static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
+static const char *autofs4_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi;
+ struct autofs_info *ino;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ sbi = autofs4_sbi(dentry->d_sb);
+ ino = autofs4_dentry_ino(dentry);
if (ino && !autofs4_oz_mode(sbi))
ino->last_used = jiffies;
return d_inode(dentry)->i_private;
@@ -23,5 +29,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
const struct inode_operations autofs4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = autofs4_follow_link
+ .get_link = autofs4_get_link
};
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 861b1e1c4777..103f5d7c3083 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -192,7 +192,7 @@ EXPORT_SYMBOL(make_bad_inode);
* Returns true if the inode in question has been marked as bad.
*/
-int is_bad_inode(struct inode *inode)
+bool is_bad_inode(struct inode *inode)
{
return (inode->i_op == &bad_inode_ops);
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 46aedacfa6a8..cc0e08252913 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static const char *befs_follow_link(struct dentry *, void **);
+static int befs_symlink_readpage(struct file *, struct page *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = {
.bmap = befs_bmap,
};
-static const struct inode_operations befs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = befs_follow_link,
- .put_link = kfree_put_link,
+static const struct address_space_operations befs_symlink_aops = {
+ .readpage = befs_symlink_readpage,
};
/*
@@ -398,7 +396,9 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &befs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
- inode->i_op = &befs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &befs_symlink_aops;
} else {
inode->i_link = befs_ino->i_data.symlink;
inode->i_op = &simple_symlink_inode_operations;
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
sizeof (struct befs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (befs_inode_cachep == NULL) {
pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
@@ -463,31 +463,33 @@ befs_destroy_inodecache(void)
* The data stream becomes the link name, unless the LONG_SYMLINK
* flag is set.
*/
-static const char *
-befs_follow_link(struct dentry *dentry, void **cookie)
+static int befs_symlink_readpage(struct file *unused, struct page *page)
{
- struct super_block *sb = dentry->d_sb;
- struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link;
+ char *link = page_address(page);
- if (len == 0) {
+ if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
- return ERR_PTR(-EIO);
+ goto fail;
}
befs_debug(sb, "Follow long symlink");
- link = kmalloc(len, GFP_NOFS);
- if (!link)
- return ERR_PTR(-ENOMEM);
if (befs_read_lsymlink(sb, data, link, len) != len) {
- kfree(link);
befs_error(sb, "Failed to read entire long symlink");
- return ERR_PTR(-EIO);
+ goto fail;
}
link[len - 1] = '\0';
- return *cookie = link;
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+fail:
+ SetPageError(page);
+ unlock_page(page);
+ return -EIO;
}
/*
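befs thereby switches from a bespoke follow_link that kmalloc'ed the target to the generic pagecache pattern: the filesystem now provides only ->readpage (befs_symlink_readpage above), and page_symlink_inode_operations pulls the target through the page cache. The generic reader looks roughly like this (simplified from fs/namei.c's page_get_link() of this release; inode_nohighmem() is what makes the bare page_address() legitimate):

static const char *page_get_link_sketch(struct dentry *dentry,
					struct inode *inode,
					struct delayed_call *callback)
{
	struct page *page;

	if (!dentry)			/* the real code has an RCU fast path */
		return ERR_PTR(-ECHILD);

	page = read_mapping_page(inode->i_mapping, 0, NULL); /* calls ->readpage */
	if (IS_ERR(page))
		return ERR_CAST(page);
	set_delayed_call(callback, page_put_link, page);  /* put_page() when done */
	return page_address(page);	/* safe: symlink pages are not highmem */
}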
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d69f430..1e5c896f6b79 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (bfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6b659967898e..3a93755e880f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <linux/sched.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -487,7 +488,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
}
/**
- * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * arch_check_elf() - check an ELF executable
* @ehdr: The main ELF header
* @has_interp: True if the ELF has an interpreter, else false.
* @state: Architecture-specific state preserved throughout the process
@@ -759,16 +760,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
would_dump(bprm, interpreter);
- retval = kernel_read(interpreter, 0, bprm->buf,
- BINPRM_BUF_SIZE);
- if (retval != BINPRM_BUF_SIZE) {
+ /* Get the exec headers */
+ retval = kernel_read(interpreter, 0,
+ (void *)&loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex));
+ if (retval != sizeof(loc->interp_elf_ex)) {
if (retval >= 0)
retval = -EIO;
goto out_free_dentry;
}
- /* Get the exec headers */
- loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
elf_ppnt++;
@@ -1236,6 +1237,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
if (vma->vm_flags & VM_DONTDUMP)
return 0;
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
/* Hugetlb memory check */
if (vma->vm_flags & VM_HUGETLB) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
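
For readers unfamiliar with FILTER(): it tests one bit of the per-process
coredump filter per mapping class, and this hunk adds DAX_SHARED/DAX_PRIVATE
classes beside the hugetlb ones. The same bit test in a stand-alone
illustration; the bit numbers below are invented, not the kernel's
MMF_DUMP_* values:

	#include <stdio.h>

	/* invented bit positions; the real ones are the MMF_DUMP_*
	 * constants in include/linux/sched.h */
	enum { DUMP_DAX_PRIVATE = 7, DUMP_DAX_SHARED = 8 };
	#define FILTER(flags, type) ((flags) & (1UL << DUMP_##type))

	int main(void)
	{
		unsigned long mm_flags = 1UL << DUMP_DAX_SHARED;

		printf("dump DAX shared?  %s\n",
		       FILTER(mm_flags, DAX_SHARED) ? "yes" : "no");
		printf("dump DAX private? %s\n",
		       FILTER(mm_flags, DAX_PRIVATE) ? "yes" : "no");
		return 0;
	}
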
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bfb7fe1..b1adb92e69de 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -35,6 +35,7 @@
#include <linux/elf-fdpic.h>
#include <linux/elfcore.h>
#include <linux/coredump.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
@@ -103,19 +104,36 @@ static void __exit exit_elf_fdpic_binfmt(void)
core_initcall(init_elf_fdpic_binfmt);
module_exit(exit_elf_fdpic_binfmt);
-static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
+static int is_elf(struct elfhdr *hdr, struct file *file)
{
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
return 0;
if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)
return 0;
- if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
+ if (!elf_check_arch(hdr))
return 0;
if (!file->f_op->mmap)
return 0;
return 1;
}
+#ifndef elf_check_fdpic
+#define elf_check_fdpic(x) 0
+#endif
+
+#ifndef elf_check_const_displacement
+#define elf_check_const_displacement(x) 0
+#endif
+
+static int is_constdisp(struct elfhdr *hdr)
+{
+ if (!elf_check_fdpic(hdr))
+ return 1;
+ if (elf_check_const_displacement(hdr))
+ return 1;
+ return 0;
+}
+
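
The #ifndef fallbacks above are the usual weak-default idiom: an
architecture that defines elf_check_fdpic()/elf_check_const_displacement()
gets its own checks, and everyone else compiles against a stub that
reports "no". The idiom in miniature; arch_check_widget is a made-up name:

	#include <stdio.h>

	/* an arch header may define this; otherwise fall back to a
	 * conservative stub, as the hunk above does */
	#ifndef arch_check_widget
	#define arch_check_widget(hdr) 0
	#endif

	int main(void)
	{
		printf("widget supported: %d\n", arch_check_widget(NULL));
		return 0;
	}
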
/*****************************************************************************/
/*
* read the program headers table into memory
@@ -191,8 +209,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* check that this is a binary we know how to deal with */
retval = -ENOEXEC;
- if (!is_elf_fdpic(&exec_params.hdr, bprm->file))
+ if (!is_elf(&exec_params.hdr, bprm->file))
+ goto error;
+ if (!elf_check_fdpic(&exec_params.hdr)) {
+#ifdef CONFIG_MMU
+ /* binfmt_elf handles non-fdpic elf except on nommu */
goto error;
+#else
+ /* nommu can only load ET_DYN (PIE) ELF */
+ if (exec_params.hdr.e_type != ET_DYN)
+ goto error;
+#endif
+ }
/* read the program header table */
retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file);
@@ -269,13 +297,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
}
- if (elf_check_const_displacement(&exec_params.hdr))
+ if (is_constdisp(&exec_params.hdr))
exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* perform insanity checks on the interpreter */
if (interpreter_name) {
retval = -ELIBBAD;
- if (!is_elf_fdpic(&interp_params.hdr, interpreter))
+ if (!is_elf(&interp_params.hdr, interpreter))
goto error;
interp_params.flags = ELF_FDPIC_FLAG_PRESENT;
@@ -306,9 +334,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = -ENOEXEC;
if (stack_size == 0)
- goto error;
+		stack_size = 131072UL; /* same as exec.c's */
- if (elf_check_const_displacement(&interp_params.hdr))
+ if (is_constdisp(&interp_params.hdr))
interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* flush all traces of the currently running executable */
@@ -319,7 +347,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
- set_personality(PER_LINUX_FDPIC);
+ if (elf_check_fdpic(&exec_params.hdr))
+ set_personality(PER_LINUX_FDPIC);
+ else
+ set_personality(PER_LINUX);
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -374,10 +405,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
PAGE_ALIGN(current->mm->start_brk);
#else
- /* create a stack and brk area big enough for everyone
- * - the brk heap starts at the bottom and works up
- * - the stack starts at the top and works down
- */
+ /* create a stack area and zero-size brk area */
stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
if (stack_size < PAGE_SIZE * 2)
stack_size = PAGE_SIZE * 2;
@@ -400,8 +428,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->brk = current->mm->start_brk;
current->mm->context.end_brk = current->mm->start_brk;
- current->mm->context.end_brk +=
- (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
@@ -1206,6 +1232,20 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
return 0;
}
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if (vma->vm_flags & VM_SHARED) {
+ dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ } else {
+ dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ }
+ return dump_ok;
+ }
+
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
if (file_inode(vma->vm_file)->i_nlink == 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 073bb57adab1..ba762ea07f67 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-static void bdev_write_inode(struct inode *inode)
+static void bdev_write_inode(struct block_device *bdev)
{
+ struct inode *inode = bdev->bd_inode;
+ int ret;
+
spin_lock(&inode->i_lock);
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
- WARN_ON_ONCE(write_inode_now(inode, true));
+ ret = write_inode_now(inode, true);
+ if (ret) {
+ char name[BDEVNAME_SIZE];
+ pr_warn_ratelimited("VFS: Dirty inode writeback failed "
+ "for block device %s (err=%d).\n",
+ bdevname(bdev, name), ret);
+ }
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
@@ -147,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
+static struct inode *bdev_file_inode(struct file *file)
+{
+ return file->f_mapping->host;
+}
+
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
@@ -329,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
*/
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
@@ -340,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
- struct inode *bd_inode = filp->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
int error;
@@ -381,9 +395,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
+ int result = -EOPNOTSUPP;
+
if (!ops->rw_page || bdev_get_integrity(bdev))
- return -EOPNOTSUPP;
- return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ return result;
+
+ result = blk_queue_enter(bdev->bd_queue, false);
+ if (result)
+ return result;
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ blk_queue_exit(bdev->bd_queue);
+ return result;
}
EXPORT_SYMBOL_GPL(bdev_read_page);
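
The new blk_queue_enter()/blk_queue_exit() pair brackets ->rw_page so the
request queue cannot be torn down while the call is in flight; a failed
enter is simply returned to the caller. As a hedged sketch, the same guard
around an arbitrary queue-backed operation; do_op is a stand-in, not a
real API, and the second argument mirrors the hunk above:

	#include <linux/blkdev.h>

	static int guarded_op(struct block_device *bdev,
			      int (*do_op)(struct block_device *))
	{
		int ret = blk_queue_enter(bdev->bd_queue, false);

		if (ret)		/* queue is dying or already gone */
			return ret;
		ret = do_op(bdev);
		blk_queue_exit(bdev->bd_queue);
		return ret;
	}
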
@@ -412,14 +434,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
+
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
+ result = blk_queue_enter(bdev->bd_queue, false);
+ if (result)
+ return result;
+
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
if (result)
end_page_writeback(page);
else
unlock_page(page);
+ blk_queue_exit(bdev->bd_queue);
return result;
}
EXPORT_SYMBOL_GPL(bdev_write_page);
@@ -427,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
/**
 * bdev_direct_access() - Get the address for directly-accessible memory
* @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
*
* If a block device is made up of directly addressable memory, this function
* will tell the caller the PFN and the address of the memory. The address
@@ -441,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* Return: negative errno if an error occurs, otherwise the number of bytes
* accessible at this address.
*/
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
{
- long avail;
+ sector_t sector = dax->sector;
+ long avail, size = dax->size;
const struct block_device_operations *ops = bdev->bd_disk->fops;
/*
@@ -463,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
if (!avail)
return -ERANGE;
+ if (avail > 0 && avail & ~PAGE_MASK)
+ return -ENXIO;
return min(avail, size);
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
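
After this change, callers pack the request into a struct blk_dax_ctl and
read the outputs from the same object. A sketch of a post-conversion
caller, with field names exactly as used in the hunk (error handling
trimmed):

	#include <linux/blkdev.h>

	static long map_one_page(struct block_device *bdev, sector_t sector)
	{
		struct blk_dax_ctl dax = {
			.sector = sector,
			.size	= PAGE_SIZE,
		};
		long avail = bdev_direct_access(bdev, &dax);

		if (avail < 0)
			return avail;
		/* dax.addr and dax.pfn now describe 'avail' accessible bytes */
		return avail;
	}
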
@@ -567,7 +594,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
@@ -673,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode)
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
spin_unlock(&bdev_lock);
return bdev;
}
@@ -689,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode)
* So, we can access it via ->i_mapping always
* without igrab().
*/
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -712,7 +739,7 @@ void bd_forget(struct inode *inode)
spin_unlock(&bdev_lock);
if (bdev)
- iput(bdev->bd_inode);
+ bdput(bdev);
}
/**
@@ -1019,12 +1046,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
static void flush_disk(struct block_device *bdev, bool kill_dirty)
{
if (__invalidate_device(bdev, kill_dirty)) {
- char name[BDEVNAME_SIZE] = "";
-
- if (bdev->bd_disk)
- disk_name(bdev->bd_disk, 0, name);
printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n", name);
+ "resized disk %s\n",
+ bdev->bd_disk ? bdev->bd_disk->disk_name : "");
}
if (!bdev->bd_disk)
@@ -1048,12 +1072,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
- char name[BDEVNAME_SIZE];
-
- disk_name(disk, 0, name);
printk(KERN_INFO
"%s: detected capacity change from %lld to %lld\n",
- name, bdev_size, disk_size);
+ disk->disk_name, bdev_size, disk_size);
i_size_write(bdev->bd_inode, disk_size);
flush_disk(bdev, false);
}
@@ -1075,7 +1096,7 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
-
+ blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1207,8 +1228,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret)
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ if (!blkdev_dax_capable(bdev))
+ bdev->bd_inode->i_flags &= ~S_DAX;
+ }
/*
* If the device is invalidated, rescan partition
@@ -1222,6 +1246,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev);
}
+
if (ret)
goto out_clear;
} else {
@@ -1242,12 +1267,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- /*
- * If the partition is not aligned on a page
- * boundary, we can't do dax I/O to it.
- */
- if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
- (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ if (!blkdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
@@ -1500,11 +1520,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
+
+ bdev_write_inode(bdev);
/*
- * ->release can cause the queue to disappear, so flush all
- * dirty data before.
+ * Detaching bdev inode from its wb in __destroy_inode()
+ * is too late: the queue which embeds its bdi (along with
+ * root wb) can be gone as soon as we put_disk() below.
*/
- bdev_write_inode(bdev->bd_inode);
+ inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -1579,14 +1602,14 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
- struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
blkdev_put(bdev, filp->f_mode);
return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
@@ -1611,7 +1634,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
ssize_t ret;
@@ -1643,7 +1666,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
@@ -1682,13 +1705,101 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#ifdef CONFIG_FS_DAX
+/*
+ * In the raw block case we do not need to contend with truncation nor
+ * unwritten file extents. Without those concerns there is no need for
+ * additional locking beyond the mmap_sem context that these routines
+ * are already executing under.
+ *
+ * Note, there is no protection if the block device is dynamically
+ * resized (partition grow/shrink) during a fault. A stable block device
+ * size is already not enforced in the blkdev_direct_IO path.
+ *
+ * For DAX, it is the responsibility of the block device driver to
+ * ensure the whole-disk device size is stable while requests are in
+ * flight.
+ *
+ * Finally, unlike the filemap_page_mkwrite() case there is no
+ * filesystem superblock to sync against freezing. We still include a
+ * pfn_mkwrite callback for dax drivers to receive write fault
+ * notifications.
+ */
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+}
+
+static void blkdev_vm_open(struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count++;
+ mutex_unlock(&bd_inode->i_mutex);
+}
+
+static void blkdev_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count--;
+ mutex_unlock(&bd_inode->i_mutex);
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+ .open = blkdev_vm_open,
+ .close = blkdev_vm_close,
+ .fault = blkdev_dax_fault,
+ .pmd_fault = blkdev_dax_pmd_fault,
+ .pfn_mkwrite = blkdev_dax_fault,
+};
+
+static const struct vm_operations_struct blkdev_default_vm_ops = {
+ .open = blkdev_vm_open,
+ .close = blkdev_vm_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ file_accessed(file);
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count++;
+ if (IS_DAX(bd_inode)) {
+ vma->vm_ops = &blkdev_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ } else {
+ vma->vm_ops = &blkdev_default_vm_ops;
+ }
+ mutex_unlock(&bd_inode->i_mutex);
+
+ return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = generic_file_mmap,
+ .mmap = blkdev_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
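
From user space nothing changes in the API: an mmap() of the raw device
simply gets the DAX vm_ops above when the device inode carries S_DAX, and
the page-cache ops otherwise. An illustrative caller; the device path is
an example only:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/pmem0", O_RDWR);	/* example device node */
		char *p;

		if (fd < 0)
			return 1;
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			close(fd);
			return 1;
		}
		p[0] = 1;	/* first write faults through the vm_ops chosen above */
		munmap(p, 4096);
		close(fd);
		return 0;
	}
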
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b93b1aa..128ce17a80b0 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o
+ uuid-tree.o props.o hash.o free-space-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
- tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
+ tests/free-space-tree-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a95851..6d263bb1621c 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
+ value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
@@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
@@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EINVAL : 0;
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
@@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = kmalloc(size, GFP_KERNEL);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 3e36e4adc4a3..88d9af3d4581 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -97,7 +97,7 @@ static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
int limit_active,
int thresh)
{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9a2ec79e8cfb..08405a3da6b1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -355,13 +355,19 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
index = srcu_read_lock(&fs_info->subvol_srcu);
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ root = btrfs_get_fs_root(fs_info, &root_key, false);
if (IS_ERR(root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
goto out;
}
+ if (btrfs_test_is_dummy_root(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -ENOENT;
+ goto out;
+ }
+
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
else if (time_seq == (u64)-1)
@@ -514,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1,
static int __add_missing_keys(struct btrfs_fs_info *fs_info,
struct list_head *head)
{
- struct list_head *pos;
+ struct __prelim_ref *ref;
struct extent_buffer *eb;
- list_for_each(pos, head) {
- struct __prelim_ref *ref;
- ref = list_entry(pos, struct __prelim_ref, list);
-
+ list_for_each_entry(ref, head, list) {
if (ref->parent)
continue;
if (ref->key_for_search.type)
@@ -557,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static void __merge_refs(struct list_head *head, int mode)
{
- struct list_head *pos1;
+ struct __prelim_ref *ref1;
- list_for_each(pos1, head) {
- struct list_head *n2;
- struct list_head *pos2;
- struct __prelim_ref *ref1;
+ list_for_each_entry(ref1, head, list) {
+ struct __prelim_ref *ref2 = ref1, *tmp;
- ref1 = list_entry(pos1, struct __prelim_ref, list);
-
- for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
- pos2 = n2, n2 = pos2->next) {
- struct __prelim_ref *ref2;
+ list_for_each_entry_safe_continue(ref2, tmp, head, list) {
struct __prelim_ref *xchg;
struct extent_inode_elem *eie;
- ref2 = list_entry(pos2, struct __prelim_ref, list);
-
if (!ref_for_same_block(ref1, ref2))
continue;
if (mode == 1) {
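
This and the check-integrity hunks below are the same mechanical cleanup:
open-coded list_for_each() plus list_entry() collapses into the typed
list_for_each_entry() iterators. The conversion in a self-contained
miniature; struct item is invented:

	#include <linux/list.h>

	struct item {
		int val;
		struct list_head node;
	};

	static int sum_items(struct list_head *head)
	{
		struct item *it;
		int sum = 0;

		/* before:
		 *	struct list_head *pos;
		 *	list_for_each(pos, head) {
		 *		it = list_entry(pos, struct item, node);
		 *		sum += it->val;
		 *	}
		 * after: */
		list_for_each_entry(it, head, node)
			sum += it->val;
		return sum;
	}
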
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13fae2..61205e3bbefa 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -192,6 +192,10 @@ struct btrfs_inode {
/* File creation time. */
struct timespec i_otime;
+ /* Hook into fs_info->delayed_iputs */
+ struct list_head delayed_iput;
+ long delayed_iput_count;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfaed276..861d472564c1 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup(
(((unsigned int)(dev_bytenr >> 16)) ^
((unsigned int)((uintptr_t)bdev))) &
(BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block *const b =
- list_entry(elem, struct btrfsic_block,
- collision_resolving_node);
+ struct btrfsic_block *b;
+ list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
return b;
}
@@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
((unsigned int)((uintptr_t)bdev_ref_to)) ^
((unsigned int)((uintptr_t)bdev_ref_from))) &
(BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block_link *const l =
- list_entry(elem, struct btrfsic_block_link,
- collision_resolving_node);
+ struct btrfsic_block_link *l;
+ list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
BUG_ON(NULL == l->block_ref_to);
BUG_ON(NULL == l->block_ref_from);
if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
@@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
const unsigned int hashval =
(((unsigned int)((uintptr_t)bdev)) &
(BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_dev_state *const ds =
- list_entry(elem, struct btrfsic_dev_state,
- collision_resolving_node);
+ struct btrfsic_dev_state *ds;
+ list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
if (ds->bdev == bdev)
return ds;
}
@@ -667,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
- return -1;
+ return -ENOMEM;
}
list_for_each_entry(device, dev_head, dev_list) {
@@ -845,8 +833,8 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
- " @%llu (%s/%llu/%d)\n",
+ btrfs_info_in_rcu(device->dev_root->fs_info,
+ "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
dev_state->name, dev_bytenr,
@@ -1660,7 +1648,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
sizeof(*block_ctx->pagev)) *
num_pages, GFP_NOFS);
if (!block_ctx->mem_to_free)
- return -1;
+ return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
for (i = 0; i < num_pages; i++) {
@@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state,
static void btrfsic_dump_database(struct btrfsic_state *state)
{
- struct list_head *elem_all;
+ const struct btrfsic_block *b_all;
BUG_ON(NULL == state);
printk(KERN_INFO "all_blocks_list:\n");
- list_for_each(elem_all, &state->all_blocks_list) {
- const struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *elem_ref_from;
+ list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
+ const struct btrfsic_block_link *l;
printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
b_all->logical_bytenr, b_all->dev_state->name,
b_all->dev_bytenr, b_all->mirror_num);
- list_for_each(elem_ref_to, &b_all->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" refers %u* to"
" %c @%llu (%s/%llu/%d)\n",
@@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
l->block_ref_to->mirror_num);
}
- list_for_each(elem_ref_from, &b_all->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from,
- struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" is ref %u* from"
" %c @%llu (%s/%llu/%d)\n",
@@ -1845,8 +1819,7 @@ again:
&state->block_hashtable);
if (NULL != block) {
u64 bytenr = 0;
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
+ struct btrfsic_block_link *l, *tmp;
if (block->is_superblock) {
bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
@@ -1967,13 +1940,8 @@ again:
 * because it still carries valuable information
* like whether it was ever written and IO completed.
*/
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &block->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry_safe(l, tmp, &block->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
l->ref_cnt--;
@@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
struct btrfsic_block *const block,
int recursion_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int ret = 0;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
@@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack
* space is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock(
const struct btrfsic_block *block,
int recursion_level)
{
- struct list_head *elem_ref_from;
+ const struct btrfsic_block_link *l;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
/* refer to comment at "abort cyclic linkage (case 1)" */
@@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock(
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_from, &block->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from, struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int indent_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int indent_add;
static char buf[80];
int cursor_position;
@@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
}
cursor_position = indent_level;
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
while (cursor_position < indent_level) {
printk(" ");
cursor_position++;
@@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root,
void btrfsic_unmount(struct btrfs_root *root,
struct btrfs_fs_devices *fs_devices)
{
- struct list_head *elem_all;
- struct list_head *tmp_all;
+ struct btrfsic_block *b_all, *tmp_all;
struct btrfsic_state *state;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root,
* just free all memory that was allocated dynamically.
* Free the blocks and the block_links.
*/
- list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
- struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
-
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &b_all->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
+ list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
+ all_blocks_node) {
+ struct btrfsic_block_link *l, *tmp;
+ list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca29b06..c473c42d7d6c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- page = __page_cache_alloc(mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index,
- GFP_NOFS)) {
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
page_cache_release(page);
goto next;
}
@@ -745,11 +744,13 @@ out:
return ret;
}
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ int num_ws;
+ atomic_t alloc_ws;
+ wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
@@ -761,10 +762,10 @@ void __init btrfs_init_compress(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&comp_idle_workspace[i]);
- spin_lock_init(&comp_workspace_lock[i]);
- atomic_set(&comp_alloc_workspace[i], 0);
- init_waitqueue_head(&comp_workspace_wait[i]);
+ INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+ spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+ atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+ init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
}
}
@@ -778,38 +779,38 @@ static struct list_head *find_workspace(int type)
int cpus = num_online_cpus();
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
again:
- spin_lock(workspace_lock);
- if (!list_empty(idle_workspace)) {
- workspace = idle_workspace->next;
+ spin_lock(ws_lock);
+ if (!list_empty(idle_ws)) {
+ workspace = idle_ws->next;
list_del(workspace);
- (*num_workspace)--;
- spin_unlock(workspace_lock);
+ (*num_ws)--;
+ spin_unlock(ws_lock);
return workspace;
}
- if (atomic_read(alloc_workspace) > cpus) {
+ if (atomic_read(alloc_ws) > cpus) {
DEFINE_WAIT(wait);
- spin_unlock(workspace_lock);
- prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+ spin_unlock(ws_lock);
+ prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(alloc_ws) > cpus && !*num_ws)
schedule();
- finish_wait(workspace_wait, &wait);
+ finish_wait(ws_wait, &wait);
goto again;
}
- atomic_inc(alloc_workspace);
- spin_unlock(workspace_lock);
+ atomic_inc(alloc_ws);
+ spin_unlock(ws_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
- atomic_dec(alloc_workspace);
- wake_up(workspace_wait);
+ atomic_dec(alloc_ws);
+ wake_up(ws_wait);
}
return workspace;
}
@@ -821,27 +822,30 @@ again:
static void free_workspace(int type, struct list_head *workspace)
{
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
-
- spin_lock(workspace_lock);
- if (*num_workspace < num_online_cpus()) {
- list_add(workspace, idle_workspace);
- (*num_workspace)++;
- spin_unlock(workspace_lock);
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
+
+ spin_lock(ws_lock);
+ if (*num_ws < num_online_cpus()) {
+ list_add(workspace, idle_ws);
+ (*num_ws)++;
+ spin_unlock(ws_lock);
goto wake;
}
- spin_unlock(workspace_lock);
+ spin_unlock(ws_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
- atomic_dec(alloc_workspace);
+ atomic_dec(alloc_ws);
wake:
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
- if (waitqueue_active(workspace_wait))
- wake_up(workspace_wait);
+ if (waitqueue_active(ws_wait))
+ wake_up(ws_wait);
}
/*
@@ -853,11 +857,11 @@ static void free_workspaces(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&comp_idle_workspace[i])) {
- workspace = comp_idle_workspace[i].next;
+ while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+ workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&comp_alloc_workspace[i]);
+ atomic_dec(&btrfs_comp_ws[i].alloc_ws);
}
}
}
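
The workspace rework above folds five parallel arrays into one array of
structs, so each compression type's state lives together. The
consolidation pattern in miniature; names invented:

	#include <stdio.h>

	#define NR_TYPES 2

	/* before: int num_ws[NR_TYPES]; long alloc_ws[NR_TYPES]; ...
	 * after: one aggregate per type */
	static struct {
		int num_ws;
		long alloc_ws;
	} comp_ws[NR_TYPES];

	int main(void)
	{
		comp_ws[0].num_ws++;
		printf("type 0 idle workspaces: %d\n", comp_ws[0].num_ws);
		return 0;
	}
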
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745eadf77d..769e0ff1b4ce 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
} else {
@@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+ search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
btrfs_set_lock_blocking(parent);
@@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
child = read_node_slot(root, mid, 0);
if (!child) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, root, mid, left);
@@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root,
u64 target;
u64 nread = 0;
u64 gen;
- int direction = path->reada;
struct extent_buffer *eb;
u32 nr;
u32 blocksize;
@@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root,
nr = slot;
while (1) {
- if (direction < 0) {
+ if (path->reada == READA_BACK) {
if (nr == 0)
break;
nr--;
- } else if (direction > 0) {
+ } else if (path->reada == READA_FORWARD) {
nr++;
if (nr >= nritems)
break;
}
- if (path->reada < 0 && objectid) {
+ if (path->reada == READA_BACK && objectid) {
btrfs_node_key(node, &disk_key, nr);
if (btrfs_disk_key_objectid(&disk_key) != objectid)
break;
@@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
free_extent_buffer(tmp);
- if (p->reada)
+ if (p->reada != READA_NONE)
reada_for_search(root, p, level, slot, key->objectid);
btrfs_release_path(p);
@@ -4940,8 +4939,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct extent_buffer *leaf;
struct btrfs_item *item;
- int last_off;
- int dsize = 0;
+ u32 last_off;
+ u32 dsize = 0;
int ret = 0;
int wret;
int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe33be80..97ad9bbeb35d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+#include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -96,6 +97,9 @@ struct btrfs_ordered_sum;
/* for storing items that use the BTRFS_UUID_KEY* types */
#define BTRFS_UUID_TREE_OBJECTID 9ULL
+/* tracks free space in block groups. */
+#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
@@ -174,7 +178,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4 };
+static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 };
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
-#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
-#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
* The key defines the order in the tree, and so it also defines (optimal)
@@ -500,6 +504,8 @@ struct btrfs_super_block {
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
+#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
+
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
@@ -526,7 +532,10 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -590,14 +599,15 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
+enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
/* if there is real range locking, this locks field will change */
- int locks[BTRFS_MAX_LEVEL];
- int reada;
+ u8 locks[BTRFS_MAX_LEVEL];
+ u8 reada;
/* keep some upper locks as we walk down */
- int lowest_level;
+ u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
@@ -823,8 +833,18 @@ struct btrfs_disk_balance_args {
*/
__le64 profiles;
- /* usage filter */
- __le64 usage;
+ /*
+ * usage filter
+ * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+ * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+ */
+ union {
+ __le64 usage;
+ struct {
+ __le32 usage_min;
+ __le32 usage_max;
+ };
+ };
/* devid filter */
__le64 devid;
@@ -846,10 +866,27 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- /* BTRFS_BALANCE_ARGS_LIMIT value */
- __le64 limit;
+ /*
+ * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+	 * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
+ * and maximum
+ */
+ union {
+ __le64 limit;
+ struct {
+ __le32 limit_min;
+ __le32 limit_max;
+ };
+ };
- __le64 unused[7];
+ /*
+ * Process chunks that cross stripes_min..stripes_max devices,
+ * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+ */
+ __le32 stripes_min;
+ __le32 stripes_max;
+
+ __le64 unused[6];
} __attribute__ ((__packed__));
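
Because the new fields sit in unions over the old __le64 slots, the
on-disk layout is unchanged and a consumer picks the form from the flags.
A sketch of such a reader; it assumes ctree.h/volumes.h are in scope, and
BTRFS_BALANCE_ARGS_USAGE_RANGE is the flag named in the comment above:

	static void read_usage_filter(const struct btrfs_disk_balance_args *bargs,
				      u64 *min, u64 *max)
	{
		if (le64_to_cpu(bargs->flags) & BTRFS_BALANCE_ARGS_USAGE_RANGE) {
			*min = le32_to_cpu(bargs->usage_min);
			*max = le32_to_cpu(bargs->usage_max);
		} else {
			/* single-value form means 0..N, per the comment above */
			*min = 0;
			*max = le64_to_cpu(bargs->usage);
		}
	}
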
/*
@@ -1061,6 +1098,13 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+struct btrfs_free_space_info {
+ __le32 extent_count;
+ __le32 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline u64 btrfs_qgroup_level(u64 qgroupid)
{
@@ -1154,6 +1198,10 @@ struct btrfs_space_info {
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
+
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
@@ -1228,6 +1276,9 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -1262,6 +1313,9 @@ struct btrfs_caching_control {
atomic_t count;
};
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
+
struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
@@ -1287,8 +1341,20 @@ struct btrfs_block_group_cache {
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
- u64 sectorsize;
u64 cache_generation;
+ u32 sectorsize;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
/*
* It is just used for the delayed data space allocation because
@@ -1344,6 +1410,15 @@ struct btrfs_block_group_cache {
struct list_head io_list;
struct btrfs_io_ctl io_ctl;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Does the block group need to be added to the free space tree?
+ * Protected by free_space_lock.
+ */
+ int needs_free_space;
};
/* delayed seq elem */
@@ -1395,6 +1470,7 @@ struct btrfs_fs_info {
struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
+ struct btrfs_root *free_space_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1782,6 +1858,8 @@ struct btrfs_fs_info {
* and will be latter freed. Protected by fs_info->chunk_mutex.
*/
struct list_head pinned_chunks;
+
+ int creating_free_space_tree;
};
struct btrfs_subvolume_writers {
@@ -1943,6 +2021,9 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+ /* For qgroup metadata space reserve */
+ atomic_t qgroup_meta_rsv;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2055,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+/*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+#define BTRFS_FREE_SPACE_INFO_KEY 198
+
+/*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+#define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+/*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
@@ -2145,6 +2247,9 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
+#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2169,6 +2274,18 @@ struct btrfs_ioctl_defrag_range_args {
btrfs_clear_opt(root->fs_info->mount_opt, opt); \
}
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(root, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
/*
* Requests for changes that need to be done during transaction commit.
*
@@ -2455,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags,
BTRFS_SETGET_STACK_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
/* struct btrfs_inode_ref */
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@ -3316,7 +3438,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
- return mapping_gfp_mask(mapping) & ~__GFP_FS;
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
/* extent-tree.c */
@@ -3365,6 +3487,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -3379,7 +3502,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins);
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
@@ -3398,7 +3522,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota);
+ u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
int delalloc);
@@ -3411,7 +3535,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int no_quota);
+ u64 root_objectid, u64 owner, u64 offset);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3427,6 +3551,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start,
struct extent_map *em);
@@ -3449,8 +3576,11 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3466,8 +3596,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
unsigned short type);
@@ -3514,6 +3644,9 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const u64 type);
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3678,6 +3811,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->csum_root);
kfree(fs_info->quota_root);
kfree(fs_info->uuid_root);
+ kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
security_free_mnt_opts(&fs_info->security_opts);
@@ -3847,7 +3981,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
- int wait;
int delay_iput;
struct completion completion;
struct list_head list;
@@ -3855,7 +3988,7 @@ struct btrfs_delalloc_work {
};
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput);
+ int delay_iput);
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
@@ -3965,7 +4098,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
-
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff);
/* file.c */
int btrfs_auto_defrag_init(void);
@@ -3996,6 +4130,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags);
+int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4004,8 +4143,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -4039,14 +4178,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
#ifdef DEBUG
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
#else
#define btrfs_debug(fs_info, fmt, args...) \
no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
#endif
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
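As a usage sketch (mirroring the converted call sites later in this patch), a
message that dereferences an RCU-protected device name moves from an
open-coded printk_ratelimited_in_rcu to the wrapper, which supplies both the
RCU read lock and the ratelimit state:

	btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
		"lost page write due to IO error on %s",
		rcu_str_deref(device->name));
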
#ifdef CONFIG_BTRFS_ASSERT
__cold
@@ -4100,16 +4327,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
}
}
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
#define btrfs_fs_incompat(fs_info, opt) \
__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
-static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
{
struct btrfs_super_block *disk_super;
disk_super = fs_info->super_copy;
return !!(btrfs_super_incompat_flags(disk_super) & flag);
}
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+}
+
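A minimal sketch of the new compat_ro helpers; the FREE_SPACE_TREE token is
the one used later in this patch, and the macro pastes it onto
BTRFS_FEATURE_COMPAT_RO_ before calling the inline helper:

	/* set the bit under super_lock, logging once if it was clear */
	btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);

	/* lockless test against the in-memory super block copy */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		root = fs_info->free_space_root;
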
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
@@ -4127,14 +4436,7 @@ do { \
__LINE__, (errno)); \
} while (0)
-#define btrfs_std_error(fs_info, errno) \
-do { \
- if ((errno)) \
- __btrfs_std_error((fs_info), __func__, \
- __LINE__, (errno), NULL); \
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...) \
+#define btrfs_std_error(fs_info, errno, fmt, args...) \
do { \
__btrfs_std_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args); \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae42720a6a..0be47e4b8136 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
atomic_set(&delayed_node->refs, 0);
- delayed_node->count = 0;
- delayed_node->flags = 0;
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
- delayed_node->index_cnt = 0;
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
- delayed_node->bytes_reserved = 0;
- memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}
static inline int btrfs_is_continuous_delayed_item(
@@ -132,7 +127,7 @@ again:
if (node)
return node;
- node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
@@ -463,6 +458,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
+
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if ((atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
waitqueue_active(&delayed_root->wait))
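The comment added above documents a general kernel pattern: waitqueue_active()
is an unlocked check, so the waker needs a full memory barrier between
updating the wake condition and testing the queue, and a value-returning
atomic op provides that barrier implicitly. A generic sketch (counter, limit
and wq are hypothetical names, not from this patch):

	if (atomic_dec_return(&counter) < limit && waitqueue_active(&wq))
		wake_up(&wq);
	/* without a value-returning atomic, an explicit smp_mb() would be
	 * required before the waitqueue_active() check */
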
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81da6d4e..914ac13bd92f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
+static bool merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref,
+ u64 seq)
+{
+ struct btrfs_delayed_ref_node *next;
+ bool done = false;
+
+ next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ while (!done && &next->list != &head->ref_list) {
+ int mod;
+ struct btrfs_delayed_ref_node *next2;
+
+ next2 = list_next_entry(next, list);
+
+ if (next == ref)
+ goto next;
+
+ if (seq && next->seq >= seq)
+ goto next;
+
+ if (next->type != ref->type)
+ goto next;
+
+ if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
+ btrfs_delayed_node_to_tree_ref(next),
+ ref->type))
+ goto next;
+ if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
+ btrfs_delayed_node_to_data_ref(next)))
+ goto next;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ swap(ref, next);
+ done = true;
+ }
+ mod = -next->ref_mod;
+ }
+
+ drop_delayed_ref(trans, delayed_refs, head, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = true;
+ } else {
+ /*
+ * Can't have multiples of the same ref on a tree block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+next:
+ next = next2;
+ }
+
+ return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_node *ref;
+ u64 seq = 0;
+
+ assert_spin_locked(&head->lock);
+
+ if (list_empty(&head->ref_list))
+ return;
+
+ /* We don't have too many refs to merge for data. */
+ if (head->is_data)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ while (&ref->list != &head->ref_list) {
+ if (seq && ref->seq >= seq)
+ goto next;
+
+ if (merge_ref(trans, delayed_refs, head, ref, seq)) {
+ if (list_empty(&head->ref_list))
+ break;
+ ref = list_first_entry(&head->ref_list,
+ struct btrfs_delayed_ref_node,
+ list);
+ continue;
+ }
+next:
+ ref = list_next_entry(ref, list);
+ }
+}
+
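/*
 * Worked example of the ref_mod arithmetic in merge_ref() above, for a
 * hypothetical pair of refs on the same tree block:
 *
 *   ref:  BTRFS_ADD_DELAYED_REF,  ref_mod = 1
 *   next: BTRFS_DROP_DELAYED_REF, ref_mod = 1
 *
 * The actions differ and ref->ref_mod is not smaller than next->ref_mod,
 * so there is no swap; mod = -next->ref_mod = -1, next is dropped, and
 * ref->ref_mod becomes 0, so ref is dropped too -- the add/drop pair
 * cancels out entirely.
 */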
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
list);
/* No need to compare bytenr nor is_head */
- if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
- exist->seq != ref->seq)
+ if (exist->type != ref->type || exist->seq != ref->seq)
goto add_tail;
if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -381,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
memcpy(&existing_ref->extent_op->key,
&ref->extent_op->key,
sizeof(ref->extent_op->key));
- existing_ref->extent_op->update_key = 1;
+ existing_ref->extent_op->update_key = true;
}
if (ref->extent_op->update_flags) {
existing_ref->extent_op->flags_to_set |=
ref->extent_op->flags_to_set;
- existing_ref->extent_op->update_flags = 1;
+ existing_ref->extent_op->update_flags = true;
}
btrfs_free_delayed_extent_op(ref->extent_op);
}
@@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, int action, int is_data)
+ u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+ int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
int count_mod = 1;
int must_insert_reserved = 0;
+ /* If reserved is provided, it must be a data extent. */
+ BUG_ON(!is_data && reserved);
+
/*
* the head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
@@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ head_ref->qgroup_reserved = 0;
+ head_ref->qgroup_ref_root = 0;
/* Record qgroup extent info if provided */
if (qrecord) {
+ if (ref_root && reserved) {
+ head_ref->qgroup_ref_root = ref_root;
+ head_ref->qgroup_reserved = reserved;
+ }
+
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
+ WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+ && existing->qgroup_reserved);
update_existing_head_ref(delayed_refs, &existing->node, ref);
/*
* we've updated the existing ref, free the newly
@@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action, int no_quota)
+ int action)
{
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
- ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action, int no_quota)
+ u64 offset, int action)
{
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
- ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- no_quota = 0;
-
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 0);
+ bytenr, num_bytes, 0, 0, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, level, action,
- no_quota);
+ num_bytes, parent, ref_root, level, action);
spin_unlock(&delayed_refs->lock);
return 0;
@@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ u64 owner, u64 offset, u64 reserved, int action,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- no_quota = 0;
-
BUG_ON(extent_op && !extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
@@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 1);
+ bytenr, num_bytes, ref_root, reserved,
+ action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
- action, no_quota);
+ action);
spin_unlock(&delayed_refs->lock);
return 0;
}
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *ref_head;
+ int ret = 0;
+
+ if (!fs_info->quota_enabled || !is_fstree(ref_root))
+ return 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ if (!ref_head) {
+ ret = -ENOENT;
+ goto out;
+ }
+ WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+ ref_head->qgroup_ref_root = ref_root;
+ ref_head->qgroup_reserved = num_bytes;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
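A hedged caller sketch (ins is a hypothetical key for a freshly allocated
data extent, with objectid = bytenr and offset = num_bytes): the head must
already be queued, since the function only looks up an existing entry and
returns -ENOENT otherwise:

	ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
					       root->root_key.objectid,
					       ins.objectid, ins.offset);
	if (ret) /* -ENOENT: no delayed ref head queued at this bytenr */
		return ret;
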
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6090fe..c24b653c7343 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
- unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -76,11 +75,11 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
+ u8 level;
+ bool update_key;
+ bool update_flags;
+ bool is_data;
u64 flags_to_set;
- int level;
- unsigned int update_key:1;
- unsigned int update_flags:1;
- unsigned int is_data:1;
};
/*
@@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
/*
+ * For qgroup reserved space freeing.
+ *
+	 * ref_root and reserved are recorded after
+	 * BTRFS_ADD_DELAYED_EXTENT is called, and are used to free
+	 * reserved qgroup space at run_delayed_refs() time.
+ */
+ u64 qgroup_ref_root;
+ u64 qgroup_reserved;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota);
+ u64 owner, u64 offset, u64 reserved, int action,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd5905cee..1e668fb7dd4c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- /*
- * Here we commit the transaction to make sure commit_total_bytes
- * of all the devices are updated.
- */
- trans = btrfs_attach_transaction(root);
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- } else if (PTR_ERR(trans) != -ENOENT) {
- return PTR_ERR(trans);
- }
-
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
if (ret)
return ret;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
+ }
+
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
- ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
- if (ret)
- btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s started",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
+ ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ if (ret)
+		btrfs_err(root->fs_info, "kobj add dev failed %d", ret);
+
btrfs_wait_ordered_roots(root->fs_info, -1);
/* force writing the updated state information to disk */
@@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- if (waitqueue_active(&fs_info->replace_wait))
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- printk_in_rcu(KERN_ERR
- "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ btrfs_err_in_rcu(root->fs_info,
+ "btrfs_scrub_dev(%s, %llu, %s) failed %d",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
return scrub_ret;
}
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s finished",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
@@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data)
progress = status_args->status.progress_1000;
kfree(status_args);
progress = div_u64(progress, 10);
- printk_in_rcu(KERN_INFO
- "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ btrfs_info_in_rcu(fs_info,
+ "continuing dev_replace from %s (devid %llu) to %s @%u%%",
dev_replace->srcdev->missing ? "<missing disk>" :
rcu_str_deref(dev_replace->srcdev->name),
dev_replace->srcdev->devid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e60d00d4ea7..e99ccd6ffb2c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -319,9 +320,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_WARNING
- "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
- "level %d\n",
+ btrfs_warn_rl(fs_info,
+ "%s checksum verify failed on %llu wanted %X found %X "
+ "level %d",
fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
@@ -362,15 +363,15 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state);
+ &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
- printk_ratelimited(KERN_ERR
- "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
- eb->fs_info->sb->s_id, eb->start,
+ btrfs_err_rl(eb->fs_info,
+ "parent transid verify failed on %llu wanted %llu found %llu",
+ eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -629,15 +630,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
- "%llu %llu\n",
- eb->fs_info->sb->s_id, found_start, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+ found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root->fs_info, eb)) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
- eb->fs_info->sb->s_id, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+ eb->start);
ret = -EIO;
goto err;
}
@@ -802,6 +802,9 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -921,7 +924,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (cpu_has_xmm4_2)
+ if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
@@ -1265,6 +1268,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
+ atomic_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1647,6 +1651,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
return fs_info->uuid_root ? fs_info->uuid_root :
ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return fs_info->free_space_root ? fs_info->free_space_root :
+ ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
@@ -1759,6 +1766,7 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
+ set_freezable();
do {
again = 0;
@@ -2144,6 +2152,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->uuid_root);
if (chunk_root)
free_root_extent_buffers(info->chunk_root);
+ free_root_extent_buffers(info->free_space_root);
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2348,8 +2357,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "BTRFS: log replay required "
- "on RO media\n");
+ btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2364,12 +2372,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
if (IS_ERR(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
kfree(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_err(fs_info, "failed to read log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
return -EIO;
@@ -2377,7 +2385,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_error(tree_root->fs_info, ret,
+ btrfs_std_error(tree_root->fs_info, ret,
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2445,6 +2453,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
fs_info->uuid_root = root;
}
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
+ }
+
return 0;
}
@@ -2572,7 +2589,7 @@ int open_ctree(struct super_block *sb,
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2653,8 +2670,8 @@ int open_ctree(struct super_block *sb,
* Read super block and check the signature bytes only
*/
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh) {
- err = -EINVAL;
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
goto fail_alloc;
}
@@ -2665,6 +2682,7 @@ int open_ctree(struct super_block *sb,
if (btrfs_check_super_csum(bh->b_data)) {
printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
+ brelse(bh);
goto fail_alloc;
}
@@ -2806,7 +2824,7 @@ int open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+ SZ_4M / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
tree_root->sectorsize = sectorsize;
@@ -2937,7 +2955,7 @@ retry_root_backup:
goto fail_fsdev_sysfs;
}
- ret = btrfs_sysfs_add_one(fs_info);
+ ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
goto fail_fsdev_sysfs;
@@ -3048,6 +3066,18 @@ retry_root_backup:
if (sb->s_flags & MS_RDONLY)
return 0;
+ if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: creating free space tree\n");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
@@ -3073,6 +3103,18 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
+ if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: clearing free space tree\n");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to clear free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
@@ -3117,7 +3159,7 @@ fail_cleaner:
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
fail_sysfs:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
fail_fsdev_sysfs:
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3179,8 +3221,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
- "I/O error on %s\n",
+ btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+ "lost page write due to IO error on %s",
rcu_str_deref(device->name));
	/* note, we don't set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
@@ -3192,6 +3234,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *super;
+ u64 bytenr;
+
+ bytenr = btrfs_sb_offset(copy_num);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ return -EINVAL;
+
+ bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+ /*
+	 * If we fail to read from the underlying devices, the best option we
+	 * have for now is to return -EIO.
+ */
+ if (!bh)
+ return -EIO;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ btrfs_super_magic(super) != BTRFS_MAGIC) {
+ brelse(bh);
+ return -EINVAL;
+ }
+
+ *bh_ret = bh;
+ return 0;
+}
+
+
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
struct buffer_head *bh;
@@ -3199,7 +3272,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
struct btrfs_super_block *super;
int i;
u64 transid = 0;
- u64 bytenr;
+ int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3207,21 +3280,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
+ ret = btrfs_read_dev_one_super(bdev, i, &bh);
+ if (ret)
continue;
super = (struct btrfs_super_block *)bh->b_data;
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- continue;
- }
if (!latest || btrfs_super_generation(super) > transid) {
brelse(latest);
@@ -3231,6 +3294,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
brelse(bh);
}
}
+
+ if (!latest)
+ return ERR_PTR(ret);
+
return latest;
}
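btrfs_read_dev_super() now returns an ERR_PTR carrying the last failure
instead of NULL, so callers switch from a NULL test to IS_ERR(), as the
open_ctree() hunk above does:

	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
	if (IS_ERR(bh)) {
		err = PTR_ERR(bh);
		goto fail_alloc;
	}
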
@@ -3299,8 +3366,9 @@ static int write_dev_supers(struct btrfs_device *device,
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
- printk(KERN_ERR "BTRFS: couldn't get super "
- "buffer head for bytenr %Lu\n", bytenr);
+ btrfs_err(device->dev_root->fs_info,
+ "couldn't get super buffer head for bytenr %llu",
+ bytenr);
errors++;
continue;
}
@@ -3449,22 +3517,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
- if ((flags & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
- ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
- return 0;
+ int raid_type;
+ int min_tolerated = INT_MAX;
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID10))
- return 1;
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+ (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[BTRFS_RAID_SINGLE].
+ tolerated_failures);
- if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (raid_type == BTRFS_RAID_SINGLE)
+ continue;
+ if (!(flags & btrfs_raid_group[raid_type]))
+ continue;
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[raid_type].
+ tolerated_failures);
+ }
- pr_warn("BTRFS: unknown raid type: %llu\n", flags);
- return 0;
+ if (min_tolerated == INT_MAX) {
+ pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
+ min_tolerated = 0;
+ }
+
+ return min_tolerated;
}
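/*
 * Worked example, using the tolerated_failures values from
 * btrfs_raid_array: a filesystem mixing RAID1 block groups (tolerates 1
 * failure) with RAID6 block groups (tolerates 2) gets min(1, 2) = 1; the
 * weakest profile in use bounds the whole filesystem, which matches the
 * hard-coded ladder this replaces.
 */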
int btrfs_calc_num_tolerated_disk_barrier_failures(
@@ -3548,7 +3625,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
@@ -3588,7 +3665,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3606,7 +3683,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3742,6 +3819,9 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* wait for the qgroup rescan worker to stop */
+ btrfs_qgroup_wait_for_completion(fs_info);
+
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al., set sem back to initial state */
@@ -3792,7 +3872,7 @@ void close_ctree(struct btrfs_root *root)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -3861,11 +3941,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
return !ret;
}
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
-{
- return set_extent_buffer_uptodate(buf);
-}
-
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_root *root;
@@ -3921,7 +3996,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
- return;
}
void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -4290,25 +4364,6 @@ again:
return 0;
}
-static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
-}
-
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4320,7 +4375,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bdfb479ea859..8e79d0070bcf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,7 +19,7 @@
#ifndef __DISKIO__
#define __DISKIO__
-#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
#define BTRFS_SUPER_MIRROR_MAX 3
@@ -35,7 +35,7 @@ enum btrfs_wq_endio_type {
static inline u64 btrfs_sb_offset(int mirror)
{
- u64 start = 16 * 1024;
+ u64 start = SZ_16K;
if (mirror)
return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
return BTRFS_SUPER_INFO_OFFSET;
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr);
@@ -114,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 601d7d45d164..60cc1399c64f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
@@ -95,8 +96,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota);
+ int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -125,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -332,13 +332,34 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
kfree(ctl);
}
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ u64 start = block_group->key.objectid;
+ u64 len = block_group->key.offset;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ root->nodesize : root->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
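/*
 * Worked example for fragment_free_space() above: for a metadata block
 * group with a 16K nodesize, chunk is 16K and step is 32K, so every other
 * 16K slice is removed from the free space cache and roughly half of the
 * group becomes unallocatable. That is why the debug hunks below also add
 * half of the group's free bytes back to space_info->bytes_used.
 */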
/*
 * This is called when caching a block group's free space (and, with this
 * patch, from the free space tree code via the export above). Since we could
 * have freed extents, we need to check the pinned_extents for any extents
 * that can't be used yet, because their free space will not be released
 * until the transaction commits.
 */
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
{
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -375,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
return total_added;
}
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
- struct btrfs_caching_control *caching_ctl;
struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -387,19 +407,28 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 total_found = 0;
u64 last = 0;
u32 nritems;
- int ret = -ENOMEM;
+ int ret;
+ bool wakeup = true;
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
extent_root = fs_info->extent_root;
path = btrfs_alloc_path();
if (!path)
- goto out;
+ return -ENOMEM;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(extent_root, block_group))
+ wakeup = false;
+#endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -408,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
- mutex_lock(&caching_ctl->mutex);
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto err;
+ goto out;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -441,17 +466,20 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
}
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
- goto err;
+ goto out;
if (ret)
break;
leaf = path->nodes[0];
@@ -464,7 +492,8 @@ next:
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -489,9 +518,10 @@ next:
else
last = key.objectid + key.offset;
- if (total_found > (1024 * 1024 * 2)) {
+ if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
- wake_up(&caching_ctl->wait);
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
@@ -503,25 +533,58 @@ next:
block_group->key.offset);
caching_ctl->progress = (u64)-1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_FINISHED;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
-err:
- btrfs_free_path(path);
- up_read(&fs_info->commit_root_sem);
-
- free_excluded_extents(extent_root, block_group);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ u64 bytes_used;
- mutex_unlock(&caching_ctl->mutex);
-out:
- if (ret) {
+ spin_lock(&block_group->space_info->lock);
spin_lock(&block_group->lock);
- block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_ERROR;
+ bytes_used = block_group->key.offset -
+ btrfs_block_group_used(&block_group->item);
+ block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(extent_root, block_group);
}
+#endif
+
+ caching_ctl->progress = (u64)-1;
+
+ up_read(&fs_info->commit_root_sem);
+ free_excluded_extents(fs_info->extent_root, block_group);
+ mutex_unlock(&caching_ctl->mutex);
+
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
@@ -607,6 +670,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
}
spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(fs_info->extent_root,
+ cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(fs_info->extent_root, cache);
+ }
+#endif
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
@@ -617,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
} else {
/*
- * We are not going to do the fast caching, set cached to the
- * appropriate value and wakeup any waiters.
+ * We're either using the free space tree or no caching at all.
+ * Set cached to the appropriate value and wakeup any waiters.
*/
spin_lock(&cache->lock);
if (load_cache_only) {
@@ -2009,8 +2088,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset,
- int no_quota)
+ u64 root_objectid, u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2022,12 +2100,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ BTRFS_ADD_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ num_bytes, parent, root_objectid,
+ owner, offset, 0,
+ BTRFS_ADD_DELAYED_REF, NULL);
}
return ret;
}
@@ -2048,16 +2126,12 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
- no_quota = 1;
-
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
@@ -2083,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
@@ -2196,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
path, 0, 1);
@@ -2291,8 +2365,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins,
- node->no_quota);
+ ref->level, &ins);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node,
parent, ref_root,
@@ -2345,6 +2418,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(root->fs_info,
+ head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
@@ -2433,7 +2511,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2834,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (trans->aborted)
return 0;
+ if (root->fs_info->creating_free_space_tree)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
@@ -2912,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return -ENOMEM;
extent_op->flags_to_set = flags;
- extent_op->update_flags = 1;
- extent_op->update_key = 0;
- extent_op->is_data = is_data ? 1 : 0;
+ extent_op->update_flags = true;
+ extent_op->update_key = false;
+ extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
@@ -3109,7 +3204,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, int);
+ u64, u64, u64, u64, u64, u64);
if (btrfs_test_is_dummy_root(root))
@@ -3150,15 +3245,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, 1);
+ key.offset);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0,
- 1);
+ parent, ref_root, level - 1, 0);
if (ret)
goto fail;
}
@@ -3253,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * 1024 * 1024)) {
+ if (block_group->key.offset < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -3339,28 +3433,47 @@ again:
spin_unlock(&block_group->lock);
/*
+	 * We hit an ENOSPC when setting up the cache in this transaction, so
+	 * just skip doing the setup; we've already cleared the cache, so
+	 * we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, SZ_256M);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
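	/*
	 * Worked example of the sizing above: a 1GiB block group gives
	 * div_u64(1G, 256M) = 4, then 4 * 16 = 64 pages, i.e. 256KiB
	 * preallocated for its free space cache with 4KiB pages.
	 */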
- ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+	 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
if (!ret)
dcs = BTRFS_DC_SETUP;
- btrfs_free_reserved_data_space(inode, num_pages);
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+ btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
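The hunk above introduces a remember-the-failure pattern: the first cache_save_setup() that hits ENOSPC sets a bit on the transaction, and every later block group in the same transaction checks that bit and skips its own setup. A minimal userspace sketch of the same idea follows; the struct, bit index, and the literal -28 (ENOSPC) are illustrative stand-ins, not btrfs code:

/* sketch: once one cache setup hits ENOSPC, later ones skip */
#include <stdbool.h>
#include <stdio.h>

#define MY_TRANS_CACHE_ENOSPC 0 /* hypothetical bit index */

struct mock_transaction {
	unsigned long flags;
};

static bool test_bit_ul(int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1UL;
}

static void set_bit_ul(int nr, unsigned long *addr)
{
	*addr |= 1UL << nr;
}

/* returns 0 if setup proceeded, -28 (ENOSPC stand-in) if skipped/failed */
static int cache_setup(struct mock_transaction *trans, bool will_enospc)
{
	if (test_bit_ul(MY_TRANS_CACHE_ENOSPC, &trans->flags))
		return -28;	/* skip: an earlier setup already failed */
	if (will_enospc) {
		set_bit_ul(MY_TRANS_CACHE_ENOSPC, &trans->flags);
		return -28;	/* remember the failure for later callers */
	}
	return 0;
}

int main(void)
{
	struct mock_transaction trans = { 0 };

	printf("%d\n", cache_setup(&trans, false)); /* 0 */
	printf("%d\n", cache_setup(&trans, true));  /* -28, sets the flag */
	printf("%d\n", cache_setup(&trans, false)); /* -28, skipped */
	return 0;
}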
@@ -3590,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
/*
- * We don't need the lock here since we are protected by the transaction
- * commit. We want to do the cache_save_setup first and then run the
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
+ spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
@@ -3606,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
/*
@@ -3618,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* on any pending IO
*/
list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
@@ -3642,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+			 * very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = write_one_cache_group(trans, root, path,
+ cache);
+ }
if (ret)
btrfs_abort_transaction(trans, root, ret);
}
@@ -3649,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
+ spin_unlock(&cur_trans->dirty_bgs_lock);
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -3746,6 +3893,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_readonly = 0;
found->bytes_may_use = 0;
found->full = 0;
+ found->max_extent_size = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3822,7 +3970,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
- u64 tmp;
+ u64 raid_type;
+ u64 allowed = 0;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3840,31 +3989,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
spin_unlock(&root->fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
- if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5);
- if (num_devices < 3)
- flags &= ~BTRFS_BLOCK_GROUP_RAID6;
- if (num_devices < 4)
- flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
- tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
- flags &= ~tmp;
-
- if (tmp & BTRFS_BLOCK_GROUP_RAID6)
- tmp = BTRFS_BLOCK_GROUP_RAID6;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
- tmp = BTRFS_BLOCK_GROUP_RAID5;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
- tmp = BTRFS_BLOCK_GROUP_RAID10;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
- tmp = BTRFS_BLOCK_GROUP_RAID1;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
- tmp = BTRFS_BLOCK_GROUP_RAID0;
-
- return extended_to_chunk(flags | tmp);
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_group[raid_type];
+ }
+ allowed &= flags;
+
+ if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
}
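The rewritten btrfs_reduce_alloc_profile() above replaces the hand-rolled num_devices checks with a table walk over btrfs_raid_array and then picks the most preferred profile that survived the mask. A self-contained sketch of that mask-then-prioritize logic, with made-up flag values and a devs_min table that only approximates the real one:

/* sketch: reduce an allocation profile to one allowed RAID level */
#include <stdio.h>
#include <stdint.h>

#define BG_RAID0  (1u << 0)
#define BG_RAID1  (1u << 1)
#define BG_RAID5  (1u << 2)
#define BG_RAID6  (1u << 3)
#define BG_RAID10 (1u << 4)

struct raid_attr { uint32_t flag; int devs_min; };

/* devs_min values mirror the usual constraints; the table is illustrative */
static const struct raid_attr raid_array[] = {
	{ BG_RAID0, 2 }, { BG_RAID1, 2 }, { BG_RAID5, 2 },
	{ BG_RAID6, 3 }, { BG_RAID10, 4 },
};

static uint32_t reduce_profile(uint32_t flags, int num_devices)
{
	uint32_t allowed = 0;
	size_t i;

	for (i = 0; i < sizeof(raid_array) / sizeof(raid_array[0]); i++)
		if (num_devices >= raid_array[i].devs_min)
			allowed |= raid_array[i].flag;
	allowed &= flags;

	/* prefer the most redundant/striped profile that survived the mask */
	if (allowed & BG_RAID6)
		return BG_RAID6;
	if (allowed & BG_RAID5)
		return BG_RAID5;
	if (allowed & BG_RAID10)
		return BG_RAID10;
	if (allowed & BG_RAID1)
		return BG_RAID1;
	if (allowed & BG_RAID0)
		return BG_RAID0;
	return 0;	/* falls back to SINGLE */
}

int main(void)
{
	/* RAID6 needs 3+ devices, so RAID1 wins on a 2-device fs */
	printf("%#x\n", (unsigned)reduce_profile(BG_RAID6 | BG_RAID1, 2));
	return 0;
}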
static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3903,11 +4047,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4006,7 +4146,8 @@ commit_trans:
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
- trans->transaction->have_free_bgs ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
@@ -4028,38 +4169,86 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- ret = btrfs_qgroup_reserve(root, write_bytes);
- if (ret)
- goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
-out:
spin_unlock(&data_sinfo->lock);
return ret;
}
/*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precise data reservation.
+ * It will replace the old btrfs_check_data_free_space(); to keep the patch
+ * split manageable, the new function is added first and swapped in later.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (ret < 0)
+ return ret;
+
+ /*
+	 * Use the new btrfs_qgroup_reserve_data() to reserve precise data space.
+	 *
+	 * TODO: Find a good method to avoid reserving data space for NOCOW
+	 * ranges without hurting performance when quotas are disabled.
+ */
+ ret = btrfs_qgroup_reserve_data(inode, start, len);
+ return ret;
+}
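The round_up/round_down pair above grows [start, start + len) outward so both ends land on sectorsize boundaries before any reservation happens. A small standalone example of the arithmetic (4096-byte sectors assumed):

/* sketch: expand [start, start+len) to sectorsize-aligned boundaries */
#include <stdio.h>
#include <stdint.h>

static uint64_t round_down_u64(uint64_t x, uint64_t a) { return x - (x % a); }

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return round_down_u64(x + a - 1, a);
}

int main(void)
{
	uint64_t sectorsize = 4096, start = 5000, len = 100;
	uint64_t aligned_len = round_up_u64(start + len, sectorsize) -
			       round_down_u64(start, sectorsize);
	uint64_t aligned_start = round_down_u64(start, sectorsize);

	/* 5000..5100 grows to 4096..8192: start 4096, len 4096 */
	printf("start=%llu len=%llu\n",
	       (unsigned long long)aligned_start,
	       (unsigned long long)aligned_len);
	return 0;
}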
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will *NOT* use the accurate qgroup reserved space API, just for
+ * cases where we can't sleep and are sure it won't affect the qgroup
+ * reserved space, e.g. clear_bit_hook().
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- WARN_ON(data_sinfo->bytes_may_use < bytes);
- data_sinfo->bytes_may_use -= bytes;
+ if (WARN_ON(data_sinfo->bytes_may_use < len))
+ data_sinfo->bytes_may_use = 0;
+ else
+ data_sinfo->bytes_may_use -= len;
trace_btrfs_space_reservation(root->fs_info, "space_info",
- data_sinfo->flags, bytes, 0);
+ data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will handle the per-inode data rsv map for the accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, start, len);
+}
+
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -4103,14 +4292,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- thresh = max_t(u64, 64 * 1024 * 1024,
- div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (num_bytes - num_allocated < thresh)
return 1;
}
- if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -4310,7 +4498,7 @@ out:
* transaction.
*/
if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ trans->chunk_bytes_reserved >= (u64)SZ_2M) {
btrfs_create_pending_block_groups(trans, trans->root);
btrfs_trans_release_chunk_metadata(trans);
}
@@ -4408,7 +4596,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
return nr;
}
-#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
* shrink metadata reservation for delalloc
@@ -4613,8 +4801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim;
- to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
- 16 * 1024 * 1024);
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL)) {
@@ -4625,8 +4812,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
- if (can_overcommit(root, space_info, 1024 * 1024,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4891,13 +5077,9 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->csum_root && trans->adding_csums)
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->uuid_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ (root == root->fs_info->csum_root && trans->adding_csums) ||
+ (root == root->fs_info->uuid_root))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -5186,7 +5368,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
- block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -5340,7 +5522,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
- ret = btrfs_qgroup_reserve(root, num_bytes);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5358,10 +5540,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
- if (ret) {
- if (*qgroup_reserved)
- btrfs_qgroup_free(root, *qgroup_reserved);
- }
+ if (ret && *qgroup_reserved)
+ btrfs_qgroup_free_meta(root, *qgroup_reserved);
return ret;
}
@@ -5522,15 +5702,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve_meta(root,
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5653,41 +5833,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
* @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start range we are writing to
+ * @len: the length of the range we are writing
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
*
* This will do the following things
*
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in the data space info for len bytes
+ * and reserve the corresponding precise qgroup space
+ * (done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ * also reserve metadata space in a per-root over-reserve manner.
+ * o add to the inode's ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ * (the above three are all done in delalloc_reserve_metadata)
*
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success.
+ * Return <0 for error (-ENOSPC or -EDQUOT).
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
- if (ret) {
- btrfs_free_reserved_data_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, start, len);
+ if (ret < 0)
return ret;
- }
-
- return 0;
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, start, len);
+ return ret;
}
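btrfs_delalloc_reserve_space() is a textbook two-phase reservation: take the data reservation, then the metadata one, and roll the first back if the second fails so nothing leaks. Sketched in miniature below with dummy reserve functions and -28 standing in for ENOSPC:

/* sketch: reserve A then B; undo A if B fails */
#include <stdio.h>

static int reserve_data(int ok)     { return ok ? 0 : -28; }
static int reserve_metadata(int ok) { return ok ? 0 : -28; }
static void free_data(void)         { puts("rolled back data reservation"); }

static int reserve_both(int data_ok, int meta_ok)
{
	int ret = reserve_data(data_ok);

	if (ret < 0)
		return ret;	/* nothing to undo yet */
	ret = reserve_metadata(meta_ok);
	if (ret < 0)
		free_data();	/* keep reservations balanced on failure */
	return ret;
}

int main(void)
{
	printf("%d\n", reserve_both(1, 0)); /* -28, data rolled back */
	return 0;
}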
/**
* btrfs_delalloc_release_space - release data and metadata space for delalloc
* @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
*
* This must be matched with a call to btrfs_delalloc_reserve_space. This is
* called in the case that we don't need the metadata AND data reservations
@@ -5696,11 +5883,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
- btrfs_delalloc_release_metadata(inode, num_bytes);
- btrfs_free_reserved_data_space(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -5777,19 +5965,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
- /*
- * No longer have used bytes in this block group, queue
- * it for deletion.
- */
- if (old_val == 0) {
- spin_lock(&info->unused_bgs_lock);
- if (list_empty(&cache->bg_list)) {
- btrfs_get_block_group(cache);
- list_add_tail(&cache->bg_list,
- &info->unused_bgs);
- }
- spin_unlock(&info->unused_bgs_lock);
- }
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5801,6 +5976,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+		 * dirty list to avoid races between the cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -6065,6 +6256,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+ u64 *empty_cluster)
+{
+ struct btrfs_free_cluster *ret = NULL;
+ bool ssd = btrfs_test_opt(root, SSD);
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (ssd)
+ *empty_cluster = SZ_2M;
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &root->fs_info->meta_alloc_cluster;
+ if (!ssd)
+ *empty_cluster = SZ_64K;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+ ret = &root->fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+}
+
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
const bool return_free_space)
{
@@ -6072,7 +6291,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
@@ -6081,8 +6303,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
+ total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(root,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
}
len = cache->key.objectid + cache->key.offset - start;
@@ -6095,12 +6323,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
}
start += len;
+ total_unpinned += len;
space_info = cache->space_info;
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+		 * cluster to be created inside of it, go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
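The block above clears a cluster's fragmented hint once a block group has unpinned more than twice the empty_cluster size (note the earlier empty_cluster <<= 1), on the theory that a cluster might fit again. A toy model of that threshold logic; the sizes mirror the non-SSD metadata case but are otherwise illustrative:

/* sketch: clear a "fragmented" hint once enough space is unpinned */
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

#define SZ_64K (64ULL * 1024)
#define SZ_2M  (2ULL * 1024 * 1024)

struct cluster { bool fragmented; };

int main(void)
{
	struct cluster cl = { .fragmented = true };
	bool ssd = false;
	uint64_t empty_cluster = ssd ? SZ_2M : SZ_64K;	/* metadata case */
	uint64_t threshold = empty_cluster << 1;	/* be conservative */
	uint64_t total_unpinned = 0;
	uint64_t unpin_chunks[] = { SZ_64K, SZ_64K, SZ_64K };
	int i;

	for (i = 0; i < 3; i++) {
		total_unpinned += unpin_chunks[i];
		if (cl.fragmented && total_unpinned > threshold) {
			cl.fragmented = false;	/* a cluster may fit again */
			printf("cleared after %llu bytes\n",
			       (unsigned long long)total_unpinned);
		}
	}
	return 0;
}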
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
@@ -6233,7 +6476,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
- int no_quota = node->no_quota;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
@@ -6242,14 +6484,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
- if (!info->quota_enabled || !is_fstree(root_objectid))
- no_quota = 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -6472,6 +6711,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+ num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
@@ -6570,7 +6816,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
buf->start, buf->len,
parent, root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret); /* -ENOMEM */
}
@@ -6618,7 +6864,7 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota)
+ u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6641,13 +6887,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+ BTRFS_DROP_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF,
- NULL, no_quota);
+ offset, 0,
+ BTRFS_DROP_DELAYED_REF, NULL);
}
return ret;
}
@@ -6833,7 +7079,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
- int empty_cluster = 2 * 1024 * 1024;
+ u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
@@ -6843,6 +7089,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
+ bool orig_have_caching_bg = false;
+ bool full_search = false;
WARN_ON(num_bytes < root->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6858,36 +7106,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
/*
- * If the space info is for both data and metadata it means we have a
- * small filesystem and we can't use the clustering stuff.
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
*/
- if (btrfs_mixed_space_info(space_info))
- use_cluster = false;
-
- if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
- last_ptr = &root->fs_info->meta_alloc_cluster;
- if (!btrfs_test_opt(root, SSD))
- empty_cluster = 64 * 1024;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
- btrfs_test_opt(root, SSD)) {
- last_ptr = &root->fs_info->data_alloc_cluster;
+ if (unlikely(space_info->max_extent_size)) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
}
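The new max_extent_size check turns a remembered worst case into a fast path: if recent searches could not find anything bigger than N bytes, a request larger than N returns ENOSPC immediately instead of rescanning every block group. A minimal sketch, with -28 as an ENOSPC stand-in:

/* sketch: fail fast when the request exceeds the last known max extent */
#include <stdio.h>
#include <stdint.h>

static uint64_t max_extent_size; /* largest contiguous chunk seen lately */

static int try_alloc(uint64_t num_bytes)
{
	if (max_extent_size && num_bytes > max_extent_size)
		return -28;	/* ENOSPC without an expensive search */
	/* ... the full block group search would go here ... */
	return 0;
}

int main(void)
{
	max_extent_size = 1 << 20;
	printf("%d\n", try_alloc(4 << 20));  /* -28 */
	printf("%d\n", try_alloc(64 << 10)); /* 0 */
	return 0;
}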
+ last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ hint_byte = last_ptr->window_start;
+ use_cluster = false;
+ }
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
-
- if (!last_ptr)
- empty_cluster = 0;
-
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
@@ -6922,6 +7181,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
search:
have_caching_bg = false;
+ if (index == 0 || index == __get_raid_index(flags))
+ full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
@@ -6955,6 +7216,7 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
+ have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -6969,7 +7231,7 @@ have_block_group:
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
- if (last_ptr) {
+ if (last_ptr && use_cluster) {
struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
@@ -7095,6 +7357,16 @@ refill_cluster:
}
unclustered_alloc:
+ /*
+ * We are doing an unclustered alloc, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get
+ * more space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
@@ -7127,8 +7399,6 @@ unclustered_alloc:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
- if (!cached)
- have_caching_bg = true;
goto loop;
}
checks:
@@ -7169,6 +7439,10 @@ loop:
}
up_read(&space_info->groups_sem);
+ if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+ && !orig_have_caching_bg)
+ orig_have_caching_bg = true;
+
if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
goto search;
@@ -7185,7 +7459,20 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- loop++;
+ if (loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any unached bgs and we've alrelady done a
+ * full search through.
+ */
+ if (orig_have_caching_bg || !full_search)
+ loop = LOOP_CACHING_WAIT;
+ else
+ loop = LOOP_ALLOC_CHUNK;
+ } else {
+ loop++;
+ }
+
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
@@ -7203,6 +7490,15 @@ loop:
ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ loop = LOOP_NO_EMPTY_SIZE;
+
/*
* Do not bail out on ENOSPC since we
* can do more things.
@@ -7219,6 +7515,15 @@ loop:
}
if (loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (empty_size == 0 &&
+ empty_cluster == 0) {
+ ret = -ENOSPC;
+ goto out;
+ }
empty_size = 0;
empty_cluster = 0;
}
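Taken together, the loop changes above amount to an escalation ladder: start with cached block groups only, optionally wait for caching, then force a chunk allocation, and finally drop the empty_size/empty_cluster slack, bailing out early when a step can be proven useless. A compact model of the transition rule, using assumed enum names patterned on the kernel's:

/* sketch: the allocator's escalation ladder as a plain state machine */
#include <stdbool.h>
#include <stdio.h>

enum loop_state {
	LOOP_CACHING_NOWAIT,	/* only use already-cached block groups */
	LOOP_CACHING_WAIT,	/* wait for caching to make progress */
	LOOP_ALLOC_CHUNK,	/* try to allocate a brand new chunk */
	LOOP_NO_EMPTY_SIZE,	/* last resort: drop slack requirements */
};

static enum loop_state next_loop(enum loop_state loop, bool have_caching_bg,
				 bool full_search)
{
	if (loop == LOOP_CACHING_NOWAIT) {
		/* skip the wait step if nothing is caching and the
		 * first pass already covered every block group */
		if (have_caching_bg || !full_search)
			return LOOP_CACHING_WAIT;
		return LOOP_ALLOC_CHUNK;
	}
	return loop + 1;
}

int main(void)
{
	printf("%d\n", next_loop(LOOP_CACHING_NOWAIT, false, true)); /* 2 */
	return 0;
}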
@@ -7227,11 +7532,20 @@ loop:
} else if (!ins->objectid) {
ret = -ENOSPC;
} else if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
ret = 0;
}
out:
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = max_extent_size;
+ spin_unlock(&space_info->lock);
ins->offset = max_extent_size;
+ }
return ret;
}
@@ -7280,7 +7594,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
- bool final_tried = false;
+ bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
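Initializing final_tried to (num_bytes == min_alloc_size) means a caller that already asks for the minimum gets exactly one attempt instead of a pointless retry at the same size. The surrounding retry loop (not shown in this hunk) behaves roughly like this sketch, where alloc_exact() is a toy allocator and -28 stands in for ENOSPC:

/* sketch: shrink the request toward a minimum before giving up */
#include <stdio.h>
#include <stdint.h>

static int alloc_exact(uint64_t n) { return n > 4096 ? -28 : 0; } /* toy */

static int reserve(uint64_t num_bytes, uint64_t min_alloc_size)
{
	int final_tried = num_bytes == min_alloc_size;
	int ret;

	for (;;) {
		ret = alloc_exact(num_bytes);
		if (ret != -28 || final_tried)
			return ret;
		num_bytes /= 2;			/* back off */
		if (num_bytes <= min_alloc_size) {
			num_bytes = min_alloc_size; /* one last attempt */
			final_tried = 1;
		}
	}
}

int main(void)
{
	printf("%d\n", reserve(65536, 4096)); /* 0 after backing off */
	return 0;
}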
@@ -7415,6 +7729,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ins->offset);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7429,8 +7748,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota)
+ int level, struct btrfs_key *ins)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7496,6 +7814,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ num_bytes);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7511,7 +7834,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins)
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins)
{
int ret;
@@ -7520,7 +7844,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
ins->offset, 0,
root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+ ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+ NULL);
return ret;
}
@@ -7576,7 +7901,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
- btrfs_set_buffer_uptodate(buf);
+ set_extent_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
@@ -7722,19 +8047,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- if (skinny_metadata)
- extent_op->update_key = 0;
- else
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
+ extent_op->update_key = skinny_metadata ? false : true;
+ extent_op->update_flags = true;
+ extent_op->is_data = false;
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
ins.objectid, ins.offset,
parent, root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
+ extent_op);
if (ret)
goto out_free_delayed;
}
@@ -7850,21 +8172,47 @@ reada:
}
/*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
*/
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_qgroup_extent_record *qrecord;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+ if (!qrecord)
+ return -ENOMEM;
+
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ kfree(qrecord);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
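record_one_subtree_extent() uses the usual allocate-outside, insert-under-lock, free-on-duplicate idiom: the kmalloc happens before taking delayed_refs->lock, and if a record for the same extent already exists the fresh one is simply freed. A simplified userspace version of the idiom (linear list instead of the kernel's structure, -12 standing in for ENOMEM):

/* sketch: allocate a record, insert under a lock, free on duplicate */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct record { uint64_t bytenr; struct record *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct record *records;

/* must be called under lock; returns the existing duplicate, or NULL
 * after inserting the new record */
static struct record *insert_record(struct record *r)
{
	struct record *p;

	for (p = records; p; p = p->next)
		if (p->bytenr == r->bytenr)
			return p;
	r->next = records;
	records = r;
	return NULL;
}

static int record_extent(uint64_t bytenr)
{
	struct record *r = malloc(sizeof(*r));

	if (!r)
		return -12;	/* ENOMEM */
	r->bytenr = bytenr;
	pthread_mutex_lock(&lock);
	if (insert_record(r))
		free(r);	/* someone beat us to it */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	record_extent(4096);
	record_extent(4096);	/* duplicate is freed, not leaked */
	printf("head=%llu\n", (unsigned long long)records->bytenr);
	return 0;
}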
+
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type;
+ int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
+ /* We can be called directly from walk_up_proc() */
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
@@ -7883,6 +8231,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -7951,8 +8303,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -8030,6 +8380,11 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = record_one_subtree_extent(trans, root, child_bytenr,
+ root->nodesize);
+ if (ret)
+ goto out;
}
if (level == 0) {
@@ -8275,14 +8630,15 @@ skip:
ret = account_shared_subtree(trans, root, next,
generation, level - 1);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "Error "
"%d accounting shared subtree. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0, 0);
+ root->root_key.objectid, level - 1, 0);
BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
@@ -8367,10 +8723,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = account_leaf_items(trans, root, eb);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "error "
"%d accounting leaf items. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
/* make block locked assertion in clean_tree_block happy */
@@ -8692,7 +9049,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err);
+ btrfs_std_error(root->fs_info, err, NULL);
return err;
}
@@ -8831,7 +9188,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
if ((sinfo->flags &
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
!force)
- min_allocable_bytes = 1 * 1024 * 1024;
+ min_allocable_bytes = SZ_1M;
else
min_allocable_bytes = 0;
@@ -8880,7 +9237,7 @@ again:
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (trans->transaction->dirty_bg_run) {
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9363,6 +9720,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
cache->full_stripe_len = btrfs_full_stripe_len(root,
&root->fs_info->mapping_tree,
start);
+ set_free_space_tree_thresholds(cache);
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -9374,6 +9733,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
+ mutex_init(&cache->free_space_lock);
return cache;
}
@@ -9398,7 +9758,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (btrfs_test_opt(root, SPACE_CACHE) &&
@@ -9584,6 +9944,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ add_block_group_free_space(trans, root->fs_info, block_group);
+ /* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
}
@@ -9614,6 +9976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->needs_free_space = 1;
ret = exclude_super_stripes(root, cache);
if (ret) {
/*
@@ -9630,6 +9993,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(root, cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(root, cache);
+ }
+#endif
/*
* Call to ensure the corresponding space_info object is created and
* assigned to our block group, but don't update its counters just yet.
@@ -9976,6 +10347,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
unlock_chunks(root);
+ ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+ if (ret)
+ goto out;
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9991,6 +10366,47 @@ out:
return ret;
}
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = (struct map_lookup *)em->bdev;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
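The comment above fixes the metadata reservation at 3 + N units per removed block group. As a worked example of the count (the helper name below is assumed, not btrfs's):

/* sketch: units of metadata reservation needed to remove a block group */
#include <stdio.h>

static unsigned int remove_bg_items(unsigned int num_stripes)
{
	/*
	 * 1 free space inode orphan + 1 block group item +
	 * 1 free space item + one device extent item per stripe
	 */
	return 3 + num_stripes;
}

int main(void)
{
	printf("%u\n", remove_bg_items(2)); /* RAID1 over 2 devices -> 5 */
	return 0;
}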
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -10014,22 +10430,25 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
- space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
- block_group->ro) {
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -10054,8 +10473,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- /* 1 for btrfs_orphan_reserve_metadata() */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
if (IS_ERR(trans)) {
btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
@@ -10135,17 +10554,21 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* until transaction commit to do the actual discard.
*/
if (trimming) {
- WARN_ON(!list_empty(&block_group->bg_list));
- spin_lock(&trans->transaction->deleted_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
list_move(&block_group->bg_list,
&trans->transaction->deleted_bgs);
- spin_unlock(&trans->transaction->deleted_bgs_lock);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_get_block_group(block_group);
}
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10370,8 +10793,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
- * Make sure counter is updated before we wake up
- * waiters.
+ * Make sure counter is updated before we wake up waiters.
*/
smp_mb();
if (waitqueue_active(&root->subv_writers->wait))
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/fs/btrfs/extent-tree.h
+++ /dev/null
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3915c9473e94..2e7c97a3f344 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
inode = tree->mapping->host;
isize = i_size_read(inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- printk_ratelimited(KERN_DEBUG
- "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
caller, btrfs_ino(inode), isize, start, end);
}
}
@@ -131,6 +131,25 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return;
+ if (set && (state->state & bits) == bits)
+ return;
+ if (!set && (state->state & bits) == 0)
+ return;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ /* ENOMEM */
+ BUG_ON(ret < 0);
+}
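add_extent_changeset() only accumulates ranges whose bits actually flip: setting bits that are already set, or clearing bits that are already clear, contributes nothing to bytes_changed. A standalone sketch of that filter, without the ulist bookkeeping:

/* sketch: record only ranges whose bits actually change */
#include <stdio.h>
#include <stdint.h>

struct state { uint64_t start, end; unsigned bits; };
struct changeset { uint64_t bytes_changed; };

static void add_changeset(const struct state *st, unsigned bits,
			  struct changeset *cs, int set)
{
	if (!cs)
		return;
	if (set && (st->bits & bits) == bits)
		return;		/* setting bits already set: no change */
	if (!set && (st->bits & bits) == 0)
		return;		/* clearing bits already clear: no change */
	cs->bytes_changed += st->end - st->start + 1;
}

int main(void)
{
	struct state st = { 0, 4095, 0x1 };
	struct changeset cs = { 0 };

	add_changeset(&st, 0x1, &cs, 1); /* already set: skipped */
	add_changeset(&st, 0x2, &cs, 1); /* records 4096 bytes */
	printf("%llu\n", (unsigned long long)cs.bytes_changed);
	return 0;
}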
+
static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, unsigned *bits,
+ struct extent_changeset *changeset);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
state->start = start;
state->end = end;
- set_state_bits(tree, state, bits);
+ set_state_bits(tree, state, bits, changeset);
node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake)
+ unsigned *bits, int wake,
+ struct extent_changeset *changeset)
{
struct extent_state *next;
unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
+ add_extent_changeset(state, bits_to_clear, changeset, 0);
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -594,7 +616,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -671,7 +693,8 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake,
+ changeset);
goto next;
}
goto search_again;
@@ -692,13 +715,13 @@ hit_next:
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
@@ -718,7 +741,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -789,7 +812,7 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
+ add_extent_changeset(state, bits_to_set, changeset, 1);
state->state |= bits_to_set;
}
@@ -835,7 +859,7 @@ static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask)
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -850,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
prealloc = alloc_extent_state(mask);
BUG_ON(!prealloc);
}
@@ -873,7 +897,7 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -899,7 +923,7 @@ hit_next:
goto out;
}
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -1008,7 +1032,7 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
@@ -1028,7 +1052,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask);
+ cached_state, mask, NULL);
}
@@ -1076,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1111,7 +1135,7 @@ again:
goto out;
}
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0,
+ NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0);
+ clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
@@ -1253,81 +1278,49 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
first_iteration = false;
goto again;
}
/* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
{
- return set_extent_bit(tree, start, end, bits, NULL,
- NULL, mask);
-}
-
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
-{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
-}
-
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE,
- NULL, cached_state, mask);
-}
-
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, mask);
-}
+ /*
+	 * We don't support EXTENT_LOCKED yet, as the current changeset will
+	 * record any bits changed, so for the EXTENT_LOCKED case it will
+	 * either fail with -EEXIST or the changeset will record the whole
+	 * range.
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+ changeset);
}
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
- NULL, mask);
+ return __clear_extent_bit(tree, start, end, bits, wake, delete,
+ cached, mask, NULL);
}
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
+ /*
+	 * We don't support the EXTENT_LOCKED case for the same reason as
+	 * set_record_extent_bits().
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, mask);
+ return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+ changeset);
}
/*
@@ -1335,15 +1328,15 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached_state)
+ struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS);
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1354,18 +1347,13 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, 0, NULL);
-}
-
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS);
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1375,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
return 1;
}
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- mask);
-}
-
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- GFP_NOFS);
-}
-
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1401,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1418,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
/*
* helper function to set both pages and extents in the tree writeback
*/
-static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1437,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1737,7 +1709,7 @@ again:
BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1757,7 +1729,7 @@ out_failed:
return found;
}
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
@@ -1772,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (page_ops == 0)
- return 0;
+ return;
if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
mapping_set_error(inode->i_mapping, -EIO);
@@ -1806,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
index += ret;
cond_resched();
}
- return 0;
}
/*
@@ -2078,8 +2049,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
- printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
btrfs_ino(inode), start,
rcu_str_deref(dev->name), sector);
bio_put(bio);
@@ -2453,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
/* lots and lots of room for performance fixes in the end_bio funcs */
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
@@ -2474,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
ret = ret < 0 ? ret : -EIO;
mapping_set_error(page->mapping, ret);
}
- return 0;
}
/*
@@ -2516,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (end_extent_writepage(page, bio->bi_error, start, end))
- continue;
-
+ end_extent_writepage(page, bio->bi_error, start, end);
end_page_writeback(page);
}
@@ -3070,8 +3038,12 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
+ if (parent_locked)
+ free_extent_state(cached);
+ else
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -4259,7 +4231,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state);
+ lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -4319,8 +4291,8 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
- if ((mask & __GFP_WAIT) &&
- page->mapping->host->i_size > 16 * 1024 * 1024) {
+ if (gfpflags_allow_blocking(mask) &&
+ page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
len = end - start + 1;
@@ -4469,7 +4441,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4730,24 +4702,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
{
struct extent_buffer *eb;
- unsigned long len;
unsigned long num_pages;
unsigned long i;
- if (!fs_info) {
- /*
- * Called only from tests that don't always have a fs_info
- * available, but we know that nodesize is 4096
- */
- len = 4096;
- } else {
- len = fs_info->tree_root->nodesize;
- }
- num_pages = num_extent_pages(0, len);
+ num_pages = num_extent_pages(start, len);
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
@@ -4770,6 +4732,24 @@ err:
return NULL;
}
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ unsigned long len;
+
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+
+ return __alloc_dummy_extent_buffer(fs_info, start, len);
+}
+
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -5160,7 +5140,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
return was_dirty;
}
-int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5173,10 +5153,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
if (page)
ClearPageUptodate(page);
}
- return 0;
}
-int set_extent_buffer_uptodate(struct extent_buffer *eb)
+void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5188,7 +5167,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
page = eb->pages[i];
SetPageUptodate(page);
}
- return 0;
}
int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5527,6 +5505,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
+/*
+ * The extent buffer bitmap operations are done with byte granularity because
+ * bitmap items are not guaranteed to be aligned to a word and therefore a
+ * single word in a bitmap may straddle two pages in the extent buffer.
+ */
+#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BITMAP_FIRST_BYTE_MASK(start) \
+ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+#define BITMAP_LAST_BYTE_MASK(nbits) \
+ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+
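A quick way to sanity-check the two partial-byte masks defined above is to evaluate them for a small range. This standalone snippet mirrors the macros and prints the masks for the bit range [3, 11), where the first byte keeps bits 3..7 and the last byte keeps bits 8..10:

#include <stdio.h>

#define BITS_PER_BYTE 8
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
	((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
	(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))

int main(void)
{
	/* Setting bits [3, 11): partial masks at each end of the run. */
	printf("first: 0x%02x\n", BITMAP_FIRST_BYTE_MASK(3));	/* 0xf8 */
	printf("last:  0x%02x\n", BITMAP_LAST_BYTE_MASK(11));	/* 0x07 */
	return 0;
}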
+/*
+ * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+ * given bit number
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains the
+ * given bit number
+ * @page_offset: return offset into the page given by page_index
+ *
+ * This helper hides the ugliness of finding the byte in an extent buffer which
+ * contains a given bit.
+ */
+static inline void eb_bitmap_offset(struct extent_buffer *eb,
+ unsigned long start, unsigned long nr,
+ unsigned long *page_index,
+ size_t *page_offset)
+{
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t byte_offset = BIT_BYTE(nr);
+ size_t offset;
+
+ /*
+ * The byte we want is the offset of the extent buffer + the offset of
+ * the bitmap item in the extent buffer + the offset of the byte in the
+ * bitmap item.
+ */
+ offset = start_offset + start + byte_offset;
+
+ *page_index = offset >> PAGE_CACHE_SHIFT;
+ *page_offset = offset & (PAGE_CACHE_SIZE - 1);
+}
+
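To make the arithmetic in eb_bitmap_offset() concrete, here is a standalone sketch with hypothetical numbers (4KiB pages assumed; the buffer start, item offset and bit number are made up for illustration):

#include <stdio.h>

#define PAGE_SIZE_ 4096UL	/* assume 4KiB pages for the example */

int main(void)
{
	unsigned long long eb_start = 0x3000;	/* hypothetical, page aligned */
	unsigned long item = 100;		/* bitmap item offset in the eb */
	unsigned long nr = 50;			/* bit number to locate */

	/* start_offset is nonzero only for buffers not page aligned */
	unsigned long offset = (eb_start & (PAGE_SIZE_ - 1)) + item + nr / 8;

	printf("page %lu, byte %lu\n", offset / PAGE_SIZE_,
	       offset % PAGE_SIZE_);		/* page 0, byte 106 */
	return 0;
}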
+/**
+ * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
+ */
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+
+ eb_bitmap_offset(eb, start, nr, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
+}
+
+/**
+ * extent_buffer_bitmap_set - set an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
+ */
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len;
+ int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+ unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_set) {
+ kaddr[offset] |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0U;
+ if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) {
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] |= mask_to_set;
+ }
+}
+
+
+/**
+ * extent_buffer_bitmap_clear - clear an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
+ */
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len;
+ int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+ unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_clear) {
+ kaddr[offset] &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_BYTE;
+ mask_to_clear = ~0U;
+ if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) {
+ mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] &= ~mask_to_clear;
+ }
+}
+
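The head/body/tail structure of the set/clear loops above is easier to see without the page-crossing logic. Here is a standalone single-buffer version of the same algorithm (a sketch, not the kernel code):

#include <stdio.h>

#define BITS_PER_BYTE 8
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
#define FIRST_MASK(p) ((BYTE_MASK << ((p) & 7)) & BYTE_MASK)
#define LAST_MASK(n)  (BYTE_MASK >> (-(n) & 7))

/* Same head/body/tail loop as above, minus the page-crossing logic. */
static void bitmap_set_bytes(unsigned char *b, unsigned pos, unsigned len)
{
	unsigned size = pos + len;
	unsigned off = pos / BITS_PER_BYTE;
	int bits = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
	unsigned mask = FIRST_MASK(pos);

	while (len >= (unsigned)bits) {
		b[off++] |= mask;	/* head byte, then whole bytes */
		len -= bits;
		bits = BITS_PER_BYTE;
		mask = BYTE_MASK;
	}
	if (len)
		b[off] |= mask & LAST_MASK(size);	/* tail byte */
}

int main(void)
{
	unsigned char buf[4] = { 0 };

	bitmap_set_bytes(buf, 3, 8);	/* bits [3, 11) */
	for (int i = 0; i < 4; i++)
		printf("%02x ", buf[i]);
	printf("\n");			/* f8 07 00 00 */
	return 0;
}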
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5566,13 +5693,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu dst len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus src_offset %lu move "
+ "len %lu dst len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -5612,13 +5741,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+ "len %lu len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+ "len %lu len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset < src_offset) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36898d3..0377413bd4b9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include "ulist.h"
/* bits for the extent state */
#define EXTENT_DIRTY (1U << 0)
@@ -18,6 +19,7 @@
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -161,6 +163,17 @@ struct extent_buffer {
#endif
};
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+ /* How many bytes are set/cleared in this operation */
+ u64 bytes_changed;
+
+ /* Changed ranges */
+ struct ulist *range_changed;
+};
+
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
@@ -186,12 +199,14 @@ int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask);
+ struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL);
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
@@ -208,33 +223,105 @@ void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ GFP_NOFS);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+}
+
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, mask);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+}
+
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
+ NULL, cached_state, mask);
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
@@ -263,8 +350,10 @@ void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -309,19 +398,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+void set_extent_buffer_uptodate(struct extent_buffer *eb);
+void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
unsigned long *map_len);
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
@@ -338,7 +433,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
int mirror_num);
int clean_io_failure(struct inode *inode, u64 start, struct page *page,
unsigned int pg_offset);
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 58ece6558430..a67e1c828d0f 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
}
if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
- path->reada = 2;
+ path->reada = READA_FORWARD;
WARN_ON(bio->bi_vcnt <= 0);
@@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (search_commit) {
path->skip_locking = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c6f247ba81d..83d7859d7619 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -756,8 +756,16 @@ next_slot:
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid > ino ||
- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(del_nr == 0);
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -776,8 +784,8 @@ next_slot:
btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
} else {
- WARN_ON(1);
- extent_end = search_start;
+ /* can't happen */
+ BUG();
}
/*
@@ -847,7 +855,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 1);
+ start - extent_offset);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -925,7 +933,7 @@ delete_extent_item:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset, 0);
+ extent_offset);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -1204,7 +1212,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 1);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1231,7 +1239,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
@@ -1248,7 +1256,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
@@ -1283,7 +1291,8 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos,
+static int prepare_uptodate_page(struct inode *inode,
+ struct page *page, u64 pos,
bool force_uptodate)
{
int ret = 0;
@@ -1298,6 +1307,10 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
unlock_page(page);
return -EIO;
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
}
return 0;
}
@@ -1316,6 +1329,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
int faili;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
if (!pages[i]) {
@@ -1325,13 +1339,17 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos,
+ err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (i == num_pages - 1)
- err = prepare_uptodate_page(pages[i],
+ if (!err && i == num_pages - 1)
+ err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
+ if (err == -EAGAIN) {
+ err = 0;
+ goto again;
+ }
faili = i - 1;
goto fail;
}
@@ -1376,7 +1394,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos, 0, cached_state);
+ start_pos, last_pos, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
@@ -1469,7 +1487,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
- unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
@@ -1485,8 +1502,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!pages)
return -ENOMEM;
- first_index = pos >> PAGE_CACHE_SHIFT;
-
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1525,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
- if (ret == -ENOSPC &&
- (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+ if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret < 0)
+ break;
if (ret > 0) {
+ /*
+ * For the nodatacow case there is no need
+ * to reserve data space.
+ */
only_release_metadata = true;
/*
* our prealloc extent may be smaller than
@@ -1524,20 +1544,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = 0;
- } else {
- ret = -ENOSPC;
+ goto reserve_metadata;
}
}
-
- if (ret)
+ ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ if (ret < 0)
break;
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
- reserve_bytes);
+ btrfs_free_reserved_data_space(inode, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1603,12 +1622,17 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- if (only_release_metadata)
+ if (only_release_metadata) {
btrfs_delalloc_release_metadata(inode,
release_bytes);
- else
- btrfs_delalloc_release_space(inode,
+ } else {
+ u64 __pos;
+
+ __pos = round_down(pos, root->sectorsize) +
+ (dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode, __pos,
release_bytes);
+ }
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1684,7 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, release_bytes);
+ btrfs_delalloc_release_space(inode, pos, release_bytes);
}
}
@@ -1868,8 +1892,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
- const u64 len = end - start + 1;
+ u64 len;
+ /*
+ * The range length can be represented by u64; we have to do the
+ * typecasts to avoid signed overflow if it's [0, LLONG_MAX], e.g.
+ * from fsync().
+ */
+ len = (u64)end - (u64)start + 1;
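/*
 * A standalone illustration of why the casts matter: for the full-file
 * fsync range [0, LLONG_MAX] the signed subtraction would overflow
 * (undefined behaviour), while the u64 arithmetic yields exactly 2^63.
 *
 *	#include <stdio.h>
 *	#include <limits.h>
 *
 *	int main(void)
 *	{
 *		long long start = 0, end = LLONG_MAX;
 *		unsigned long long len = (unsigned long long)end -
 *					 (unsigned long long)start + 1;
 *		printf("len = %llu\n", len);	// 9223372036854775808
 *		return 0;
 *	}
 */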
trace_btrfs_sync_file(file, datasync);
/*
@@ -2057,8 +2086,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
}
if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start,
- end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
btrfs_end_transaction(trans, root);
goto out;
@@ -2266,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 drop_end;
int ret = 0;
int err = 0;
- int rsv_count;
+ unsigned int rsv_count;
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
@@ -2370,7 +2398,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
/*
@@ -2488,6 +2516,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
/*
+ * If we are using the NO_HOLES feature we might already have had a
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= lockend)
+ drop_end = lockend + 1;
+ /*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
@@ -2541,17 +2582,61 @@ out_only_mutex:
return err;
}
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Helper function to add a falloc range
+ *
+ * The caller should have locked the larger extent range containing
+ * [start, len).
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+ struct falloc_range *prev = NULL;
+ struct falloc_range *range = NULL;
+
+ if (list_empty(head))
+ goto insert;
+
+ /*
+ * As fallocate iterates in bytenr order, we only need to check
+ * the last range.
+ */
+ prev = list_entry(head->prev, struct falloc_range, list);
+ if (prev->start + prev->len == start) {
+ prev->len += len;
+ return 0;
+ }
+insert:
+ range = kmalloc(sizeof(*range), GFP_NOFS);
+ if (!range)
+ return -ENOMEM;
+ range->start = start;
+ range->len = len;
+ list_add_tail(&range->list, head);
+ return 0;
+}
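The merge rule in add_falloc_range() relies on the caller walking in bytenr order, so a new range can only ever coalesce with the most recently added one. A standalone sketch of the only two cases that can occur:

#include <stdio.h>

struct r { unsigned long long start, len; };

static int add_range(struct r *last, unsigned long long start,
		     unsigned long long len)
{
	if (last->len && last->start + last->len == start) {
		last->len += len;	/* contiguous: extend in place */
		return 0;
	}
	return 1;			/* caller appends a new entry */
}

int main(void)
{
	struct r last = { 4096, 4096 };

	printf("merged: %d\n", !add_range(&last, 8192, 4096)); /* merged: 1 */
	printf("len now %llu\n", last.len);			/* 8192 */
	return 0;
}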
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct falloc_range *range;
+ struct falloc_range *tmp;
+ struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+ u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2567,11 +2652,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
/*
- * Make sure we have enough space before we do the
- * allocation.
+ * Only trigger the disk allocation; don't trigger the qgroup reserve.
+ *
+ * Qgroup space is checked later.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
- if (ret)
+ ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+ if (ret < 0)
return ret;
mutex_lock(&inode->i_mutex);
@@ -2579,6 +2665,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ /*
+ * TODO: Move these two operations after we have checked the
+ * accurate reserved space, otherwise fallocate can still fail
+ * but with the page truncated or the size expanded.
+ *
+ * That's a minor problem, though, and won't do much harm.
+ */
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
@@ -2612,7 +2705,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state);
+ locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -2637,10 +2730,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
+ /* First, check if we exceed the qgroup limit */
+ INIT_LIST_HEAD(&reserve_list);
cur_offset = alloc_start;
while (1) {
- u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2746,82 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- } else if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end,
- NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans, root);
- else
- ret = btrfs_end_transaction(trans,
- root);
+ ret = add_falloc_range(&reserve_list, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
}
+ ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0)
+ break;
}
free_extent_map(em);
- if (ret < 0)
- break;
-
cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
+ if (cur_offset >= alloc_end)
break;
+ }
+
+ /*
+ * If ret is still 0, we're OK to fallocate.
+ * Otherwise just clean up the list and exit.
+ */
+ list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+ if (!ret)
+ ret = btrfs_prealloc_file_range(inode, mode,
+ range->start,
+ range->len, 1 << inode->i_blkbits,
+ offset + len, &alloc_hint);
+ list_del(&range->list);
+ kfree(range);
+ }
+ if (ret < 0)
+ goto out_unlock;
+
+ if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size and the inode item.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans, root);
}
}
+out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
out:
+ /*
+ * As we have waited on the extent range, the data_rsv_map must be
+ * empty in the range: a written data range is released from it, and
+ * a preallocated extent is released when its metadata is written.
+ * So this is used purely as cleanup.
+ */
+ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - alloc_start);
return ret;
}
@@ -2734,7 +2852,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
while (start < inode->i_size) {
@@ -2816,6 +2934,9 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .copy_file_range = btrfs_copy_file_range,
+ .clone_file_range = btrfs_clone_file_range,
+ .dedupe_file_range = btrfs_dedupe_file_range,
};
void btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66bd3ba..8f835bfa1bdd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -30,7 +30,7 @@
#include "volumes.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+#define MAX_CACHE_BYTES_PER_GIG SZ_32K
struct btrfs_trim_range {
u64 start;
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) &
- ~(__GFP_FS | __GFP_HIGHMEM));
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
gen = io_ctl->cur;
if (le64_to_cpu(*gen) != generation) {
- printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
- "(%Lu) does not match inode (%Lu)\n", *gen,
- generation);
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "space cache generation (%llu) does not match inode (%llu)",
+ *gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
- printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
- "space cache\n");
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "csum mismatch on free space cache");
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -891,7 +891,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now",
block_group->key.objectid);
}
@@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root,
static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
int ret;
/* Write out the bitmaps */
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
-
+ list_for_each_entry_safe(entry, next, bitmap_list, list) {
ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
return -ENOSPC;
@@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode)
static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
+ list_for_each_entry_safe(entry, next, bitmap_list, list)
list_del_init(&entry->list);
- }
}
static void noinline_for_stack
@@ -1215,7 +1209,7 @@ out:
* @offset - the offset for the key we'll insert
*
* This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
+ * on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
* at or below 32k, so we need to adjust how much memory we allow to be
* used by extent based free space tracking
*/
- if (size < 1024 * 1024 * 1024)
+ if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
- max_bytes = MAX_CACHE_BYTES_PER_GIG *
- div_u64(size, 1024 * 1024 * 1024);
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
/*
* we want to account for 1 more bitmap than what we have so we can make
@@ -1730,7 +1723,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
*/
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
- u64 *bytes)
+ u64 *bytes, bool for_alloc)
{
unsigned long found_bits = 0;
unsigned long max_bits = 0;
@@ -1738,11 +1731,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
unsigned long next_zero;
unsigned long extent_bits;
+ /*
+ * Skip searching the bitmap if we don't have a contiguous section that
+ * is large enough for this allocation.
+ */
+ if (for_alloc &&
+ bitmap_info->max_extent_size &&
+ bitmap_info->max_extent_size < *bytes) {
+ *bytes = bitmap_info->max_extent_size;
+ return -1;
+ }
+
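The caching idea used here, in isolation: a failed scan records the largest run it saw, and later allocations can be refused without rescanning. A standalone sketch (not the btrfs data structures):

#include <stdio.h>

struct bm { unsigned long long max_extent_size; };

static int can_satisfy(struct bm *b, unsigned long long want)
{
	if (b->max_extent_size && b->max_extent_size < want)
		return 0;	/* known too fragmented: skip the scan */
	return 1;		/* unknown or big enough: scan for real */
}

int main(void)
{
	struct bm b = { .max_extent_size = 65536 };

	printf("%d\n", can_satisfy(&b, 1 << 20));	/* 0 */
	printf("%d\n", can_satisfy(&b, 4096));		/* 1 */
	return 0;
}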
i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+ if (for_alloc && bits == 1) {
+ found_bits = 1;
+ break;
+ }
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
extent_bits = next_zero - i;
@@ -1762,6 +1770,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
*bytes = (u64)(max_bits) * ctl->unit;
+ bitmap_info->max_extent_size = *bytes;
return -1;
}
@@ -1813,7 +1822,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (entry->bitmap) {
u64 size = *bytes;
- ret = search_bitmap(ctl, entry, &tmp, &size);
+ ret = search_bitmap(ctl, entry, &tmp, &size, true);
if (!ret) {
*offset = tmp;
*bytes = size;
@@ -1874,7 +1883,8 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
- ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+ false);
if (ret < 0 || search_start != *offset)
return -EINVAL;
@@ -1919,7 +1929,7 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
- &search_bytes);
+ &search_bytes, false);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
@@ -1943,6 +1953,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
+ /*
+ * We set some bytes, so we no longer know what the max
+ * extent size is.
+ */
+ info->max_extent_size = 0;
+
return bytes_to_set;
}
@@ -1951,12 +1967,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group_cache *block_group = ctl->private;
+ bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
+ block_group))
+ forced = true;
+#endif
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
- if (ctl->free_extents < ctl->extents_thresh) {
+ if (!forced && ctl->free_extents < ctl->extents_thresh) {
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
@@ -1986,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_space_op = {
+static const struct btrfs_free_space_op free_space_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2459,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
* track of free space, and if we pass 1/2 of that we want to
* start converting things over to using bitmaps
*/
- ctl->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
+ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
/*
@@ -2661,7 +2683,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
if (search_bytes > *max_extent_size)
*max_extent_size = search_bytes;
@@ -2775,6 +2797,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long want_bits;
unsigned long min_bits;
unsigned long found_bits;
+ unsigned long max_bits = 0;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
@@ -2784,6 +2807,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
want_bits = bytes_to_bits(bytes, ctl->unit);
min_bits = bytes_to_bits(min_bytes, ctl->unit);
+ /*
+ * Don't bother looking for a cluster in this bitmap if it's heavily
+ * fragmented.
+ */
+ if (entry->max_extent_size &&
+ entry->max_extent_size < cont1_bytes)
+ return -ENOSPC;
again:
found_bits = 0;
for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2791,13 +2821,19 @@ again:
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
+ if (found_bits > max_bits)
+ max_bits = found_bits;
break;
}
+ if (next_zero - i > max_bits)
+ max_bits = next_zero - i;
i = next_zero;
}
- if (!found_bits)
+ if (!found_bits) {
+ entry->max_extent_size = (u64)max_bits * ctl->unit;
return -ENOSPC;
+ }
if (!total_found) {
start = i;
@@ -2928,7 +2964,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry;
+ struct btrfs_free_space *entry = NULL;
int ret = -ENOSPC;
u64 bitmap_offset = offset_to_bitmap(ctl, offset);
@@ -2939,8 +2975,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* The bitmap that covers offset won't be in the list unless offset
* is just its start offset.
*/
- entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
- if (entry->offset != bitmap_offset) {
+ if (!list_empty(bitmaps))
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+
+ if (!entry || entry->offset != bitmap_offset) {
entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (entry && list_empty(&entry->list))
list_add(&entry->list, bitmaps);
@@ -3056,6 +3094,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root = RB_ROOT;
cluster->max_size = 0;
+ cluster->fragmented = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
@@ -3223,7 +3262,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
}
bytes = minlen;
- ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3376,7 +3415,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
u64 count = 1;
int ret;
- ret = search_bitmap(ctl, entry, &offset, &count);
+ ret = search_bitmap(ctl, entry, &offset, &count, true);
/* Logic error; should be empty if it can't find anything */
ASSERT(!ret);
@@ -3532,6 +3571,7 @@ again:
spin_lock(&ctl->tree_lock);
info->offset = offset;
info->bytes = bytes;
+ info->max_extent_size = 0;
ret = link_free_space(ctl, info);
spin_unlock(&ctl->tree_lock);
if (ret)
@@ -3559,6 +3599,7 @@ again:
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+
bytes -= bytes_added;
offset += bytes_added;
spin_unlock(&ctl->tree_lock);
@@ -3602,7 +3643,7 @@ have_info:
bit_off = offset;
bit_bytes = ctl->unit;
- ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
if (!ret) {
if (bit_off == offset) {
ret = 1;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a16a029ad3b1..33178c490ace 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -23,6 +23,7 @@ struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
u64 bytes;
+ u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
};
@@ -36,7 +37,7 @@ struct btrfs_free_space_ctl {
int total_bitmaps;
int unit;
u64 start;
- struct btrfs_free_space_op *op;
+ const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
new file mode 100644
index 000000000000..393e36bd5845
--- /dev/null
+++ b/fs/btrfs/free-space-tree.c
@@ -0,0 +1,1591 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "free-space-tree.h"
+#include "transaction.h"
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+{
+ u32 bitmap_range;
+ size_t bitmap_size;
+ u64 num_bitmaps, total_bitmap_size;
+
+ /*
+ * We convert to bitmaps when the disk space required for using extents
+ * exceeds that required for using bitmaps.
+ */
+ bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
+ bitmap_range);
+ bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
+ total_bitmap_size = num_bitmaps * bitmap_size;
+ cache->bitmap_high_thresh = div_u64(total_bitmap_size,
+ sizeof(struct btrfs_item));
+
+ /*
+ * We allow for a small buffer between the high threshold and low
+ * threshold to avoid thrashing back and forth between the two formats.
+ */
+ if (cache->bitmap_high_thresh > 100)
+ cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100;
+ else
+ cache->bitmap_low_thresh = 0;
+}
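Plugging in concrete numbers helps here. This standalone calculation assumes 4KiB sectors, a 1GiB block group, 256-byte bitmaps (2048 bits; the BTRFS_FREE_SPACE_BITMAP_* values are not visible in this hunk) and a 25-byte item header; under those assumptions the high threshold works out to 1438 extents:

#include <stdio.h>

int main(void)
{
	/* Assumed values, not shown in this hunk. */
	unsigned long long sectorsize = 4096, bg_size = 1ULL << 30;
	unsigned long long bitmap_bits = 2048, item_size = 25;

	unsigned long long range = sectorsize * bitmap_bits;	/* 8 MiB */
	unsigned long long nbitmaps = (bg_size + range - 1) / range; /* 128 */
	unsigned long long total = nbitmaps * (item_size + 256);

	printf("high threshold: %llu extents\n", total / item_size); /* 1438 */
	return 0;
}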
+
+static int add_new_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_FREE_SPACE_INFO_KEY;
+ key.offset = block_group->key.offset;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_info);
+ btrfs_set_free_space_extent_count(leaf, info, 0);
+ btrfs_set_free_space_flags(leaf, info, 0);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_FREE_SPACE_INFO_KEY;
+ key.offset = block_group->key.offset;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret != 0) {
+ btrfs_warn(fs_info, "missing free space info for %llu",
+ block_group->key.objectid);
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_free_space_info);
+}
+
+/*
+ * btrfs_search_slot() but we're looking for the greatest key less than the
+ * passed key.
+ */
+static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int ins_len, int cow)
+{
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) {
+ ASSERT(0);
+ return -EIO;
+ }
+
+ if (p->slots[0] == 0) {
+ ASSERT(0);
+ return -EIO;
+ }
+ p->slots[0]--;
+
+ return 0;
+}
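The same "greatest key strictly less than" trick, reduced to a sorted array: find the first slot that is greater than or equal to the key, then step back one. A standalone sketch:

#include <stdio.h>

static int prev_slot(const int *a, int n, int key)
{
	int lo = 0, hi = n;		/* first index with a[i] >= key */

	while (lo < hi) {
		int mid = (lo + hi) / 2;
		if (a[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo - 1;			/* -1 if everything is >= key */
}

int main(void)
{
	int a[] = { 10, 20, 30, 40 };

	printf("%d\n", prev_slot(a, 4, 35));	/* index 2 (value 30) */
	return 0;
}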
+
+static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+{
+ return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
+}
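For scale, a standalone evaluation of the same formula: one bit per sector, rounded up to whole bytes, so a 1GiB block group with 4KiB sectors (assumed values) needs a 32KiB bitmap:

#include <stdio.h>

int main(void)
{
	unsigned long long size = 1ULL << 30, sectorsize = 4096;
	unsigned bits = size / sectorsize;	/* 262144 bits */

	printf("%u bytes\n", (bits + 7) / 8);	/* 32768 */
	return 0;
}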
+
+static unsigned long *alloc_bitmap(u32 bitmap_size)
+{
+ return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+}
+
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap;
+ char *bitmap_cursor;
+ u64 start, end;
+ u64 bitmap_range, i;
+ u32 bitmap_size, flags, expected_extent_count;
+ u32 extent_count = 0;
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ block_group->sectorsize);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
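+ /*
+ * Bump the slot so that the loop below can examine the item at
+ * slots[0] - 1, walking backwards through the leaf until it hits
+ * the block group's info item.
+ */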
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+ u64 first, last;
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ first = div_u64(found_key.objectid - start,
+ block_group->sectorsize);
+ last = div_u64(found_key.objectid + found_key.offset - start,
+ block_group->sectorsize);
+ bitmap_set(bitmap, first, last - first);
+
+ extent_count++;
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
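+ /*
+ * Write the assembled bitmap back out as a series of bitmap items,
+ * each covering at most BTRFS_FREE_SPACE_BITMAP_BITS sectors.
+ */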
+ bitmap_cursor = (char *)bitmap;
+ bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ i = start;
+ while (i < end) {
+ unsigned long ptr;
+ u64 extent_size;
+ u32 data_size;
+
+ extent_size = min(end - i, bitmap_range);
+ data_size = free_space_bitmap_size(extent_size,
+ block_group->sectorsize);
+
+ key.objectid = i;
+ key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
+ key.offset = extent_size;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ data_size);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ i += extent_size;
+ bitmap_cursor += data_size;
+ }
+
+ ret = 0;
+out:
+ vfree(bitmap);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap;
+ u64 start, end;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 offset;
+ u32 bitmap_size, flags, expected_extent_count;
+ int prev_bit = 0, bit, bitnr;
+ u32 extent_count = 0;
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ block_group->sectorsize);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ unsigned long ptr;
+ char *bitmap_cursor;
+ u32 bitmap_pos, data_size;
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ bitmap_pos = div_u64(found_key.objectid - start,
+ block_group->sectorsize *
+ BITS_PER_BYTE);
+ bitmap_cursor = ((char *)bitmap) + bitmap_pos;
+ data_size = free_space_bitmap_size(found_key.offset,
+ block_group->sectorsize);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ read_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
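+ /*
+ * Scan the reassembled bitmap for runs of set bits and insert a
+ * free space extent item for each run.
+ */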
+ offset = start;
+ bitnr = 0;
+ while (offset < end) {
+ bit = !!test_bit(bitnr, bitmap);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ key.objectid = extent_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = offset - extent_start;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += block_group->sectorsize;
+ bitnr++;
+ }
+ if (prev_bit == 1) {
+ key.objectid = extent_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = end - extent_start;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ vfree(bitmap);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
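+/*
+ * Add new_extents (which may be negative) to the extent count in the free
+ * space info item, and convert the block group between the extent and bitmap
+ * formats if the count crossed a threshold.
+ */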
+static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ int new_extents)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ u32 extent_count;
+ int ret = 0;
+
+ if (new_extents == 0)
+ return 0;
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+ extent_count += new_extents;
+ btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+
+ if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+ extent_count > block_group->bitmap_high_thresh) {
+ ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
+ path);
+ } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+ extent_count < block_group->bitmap_low_thresh) {
+ ret = convert_free_space_to_extents(trans, fs_info, block_group,
+ path);
+ }
+
+out:
+ return ret;
+}
+
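+/*
+ * Return the bit for the sector at offset within the bitmap item that path
+ * currently points to.
+ */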
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ unsigned long ptr, i;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(offset >= found_start && offset < found_end);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ i = div_u64(offset - found_start, block_group->sectorsize);
+ return !!extent_buffer_test_bit(leaf, ptr, i);
+}
+
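+/*
+ * Set or clear the bits for the portion of [*start, *start + *size) covered
+ * by the bitmap item that path points to, then advance *start and shrink
+ * *size past the processed range.
+ */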
+static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ int bit)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 end = *start + *size;
+ u64 found_start, found_end;
+ unsigned long ptr, first, last;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(*start >= found_start && *start < found_end);
+ ASSERT(end > found_start);
+
+ if (end > found_end)
+ end = found_end;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ first = div_u64(*start - found_start, block_group->sectorsize);
+ last = div_u64(end - found_start, block_group->sectorsize);
+ if (bit)
+ extent_buffer_bitmap_set(leaf, ptr, first, last - first);
+ else
+ extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
+ btrfs_mark_buffer_dirty(leaf);
+
+ *size -= end - *start;
+ *start = end;
+}
+
+/*
+ * We can't use btrfs_next_item() in modify_free_space_bitmap() because
+ * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy
+ * tree walking in btrfs_next_leaf() anyway because we know exactly what we're
+ * looking for.
+ */
+static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p)
+{
+ struct btrfs_key key;
+
+ if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) {
+ p->slots[0]++;
+ return 0;
+ }
+
+ btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]);
+ btrfs_release_path(p);
+
+ key.objectid += key.offset;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ return btrfs_search_prev_slot(trans, root, &key, p, 0, 1);
+}
+
+/*
+ * If remove is 1, then we are removing free space, thus clearing bits in the
+ * bitmap. If remove is 0, then we are adding free space, thus setting bits in
+ * the bitmap.
+ */
+static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size, int remove)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 end = start + size;
+ u64 cur_start, cur_size;
+ int prev_bit, next_bit;
+ int new_extents;
+ int ret;
+
+ /*
+ * Read the bit for the block immediately before the extent of space if
+ * that block is within the block group.
+ */
+ if (start > block_group->key.objectid) {
+ u64 prev_block = start - block_group->sectorsize;
+
+ key.objectid = prev_block;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = free_space_test_bit(block_group, path, prev_block);
+
+ /* The previous block may have been in the previous bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (start >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+ } else {
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = -1;
+ }
+
+ /*
+ * Iterate over all of the bitmaps overlapped by the extent of space,
+ * clearing/setting bits as required.
+ */
+ cur_start = start;
+ cur_size = size;
+ while (1) {
+ free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ !remove);
+ if (cur_size == 0)
+ break;
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * Read the bit for the block immediately after the extent of space if
+ * that block is within the block group.
+ */
+ if (end < block_group->key.objectid + block_group->key.offset) {
+ /* The next block may be in the next bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (end >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ next_bit = free_space_test_bit(block_group, path, end);
+ } else {
+ next_bit = -1;
+ }
+
+ if (remove) {
+ new_extents = -1;
+ if (prev_bit == 1) {
+ /* Leftover on the left. */
+ new_extents++;
+ }
+ if (next_bit == 1) {
+ /* Leftover on the right. */
+ new_extents++;
+ }
+ } else {
+ new_extents = 1;
+ if (prev_bit == 1) {
+ /* Merging with neighbor on the left. */
+ new_extents--;
+ }
+ if (next_bit == 1) {
+ /* Merging with neighbor on the right. */
+ new_extents--;
+ }
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+static int remove_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = -1;
+ int ret;
+
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(start >= found_start && end <= found_end);
+
+ /*
+ * Okay, now that we've found the free space extent which contains the
+ * free space that we are removing, there are four cases:
+ *
+ * 1. We're using the whole extent: delete the key we found and
+ * decrement the free space extent count.
+ * 2. We are using part of the extent starting at the beginning: delete
+ * the key we found and insert a new key representing the leftover at
+ * the end. There is no net change in the number of extents.
+ * 3. We are using part of the extent ending at the end: delete the key
+ * we found and insert a new key representing the leftover at the
+ * beginning. There is no net change in the number of extents.
+ * 4. We are using part of the extent in the middle: delete the key we
+ * found and insert two new keys representing the leftovers on each
+ * side. Where we used to have one extent, we now have two, so increment
+ * the extent count. We may need to convert the block group to bitmaps
+ * as a result.
+ */
+
+ /* Delete the existing key (cases 1-4). */
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ /* Add a key for leftovers at the beginning (cases 3 and 4). */
+ if (start > found_start) {
+ key.objectid = found_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = start - found_start;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ /* Add a key for leftovers at the end (cases 2 and 4). */
+ if (end < found_end) {
+ key.objectid = end;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = found_end - end;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
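+ /* Lazily create this block group's entries on first modification. */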
+ if (block_group->needs_free_space) {
+ ret = __add_block_group_free_space(trans, fs_info, block_group,
+ path);
+ if (ret)
+ return ret;
+ }
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ return modify_free_space_bitmap(trans, fs_info, block_group,
+ path, start, size, 1);
+ } else {
+ return remove_free_space_extent(trans, fs_info, block_group,
+ path, start, size);
+ }
+}
+
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ block_group = btrfs_lookup_block_group(fs_info, start);
+ if (!block_group) {
+ ASSERT(0);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mutex_lock(&block_group->free_space_lock);
+ ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
+ start, size);
+ mutex_unlock(&block_group->free_space_lock);
+
+ btrfs_put_block_group(block_group);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+static int add_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key, new_key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = 1;
+ int ret;
+
+ /*
+ * We are adding a new extent of free space, but it may need to be merged
+ * with adjacent extents. There are four cases here:
+ *
+ * 1. The new extent does not have any immediate neighbors to merge
+ * with: add the new key and increment the free space extent count. We
+ * may need to convert the block group to bitmaps as a result.
+ * 2. The new extent has an immediate neighbor before it: remove the
+ * previous key and insert a new key combining both of them. There is no
+ * net change in the number of extents.
+ * 3. The new extent has an immediate neighbor after it: remove the next
+ * key and insert a new key combining both of them. There is no net
+ * change in the number of extents.
+ * 4. The new extent has immediate neighbors on both sides: remove both
+ * of the keys and insert a new key combining all of them. Where we used
+ * to have two extents, we now have one, so decrement the extent count.
+ */
+
+ new_key.objectid = start;
+ new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ new_key.offset = size;
+
+ /* Search for a neighbor on the left. */
+ if (start == block_group->key.objectid)
+ goto right;
+ key.objectid = start - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto right;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT(found_start < start && found_end <= start);
+
+ /*
+ * Delete the neighbor on the left and absorb it into the new key (cases
+ * 2 and 4).
+ */
+ if (found_end == start) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.objectid = found_start;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+right:
+ /* Search for a neighbor on the right. */
+ if (end == block_group->key.objectid + block_group->key.offset)
+ goto insert;
+ key.objectid = end;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto insert;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT((found_start < start && found_end <= start) ||
+ (found_start >= end && found_end > end));
+
+ /*
+ * Delete the neighbor on the right and absorb it into the new key
+ * (cases 3 and 4).
+ */
+ if (found_start == end) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+insert:
+ /* Insert the new key (cases 1-4). */
+ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ if (block_group->needs_free_space) {
+ ret = __add_block_group_free_space(trans, fs_info, block_group,
+ path);
+ if (ret)
+ return ret;
+ }
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ return modify_free_space_bitmap(trans, fs_info, block_group,
+ path, start, size, 0);
+ } else {
+ return add_free_space_extent(trans, fs_info, block_group, path,
+ start, size);
+ }
+}
+
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ block_group = btrfs_lookup_block_group(fs_info, start);
+ if (!block_group) {
+ ASSERT(0);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mutex_lock(&block_group->free_space_lock);
+ ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
+ size);
+ mutex_unlock(&block_group->free_space_lock);
+
+ btrfs_put_block_group(block_group);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+/*
+ * Populate the free space tree by walking the extent tree. Operations on the
+ * extent tree that happen as a result of writes to the free space tree will go
+ * through the normal add/remove hooks.
+ */
+static int populate_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path, *path2;
+ struct btrfs_key key;
+ u64 start, end;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ path2 = btrfs_alloc_path();
+ if (!path2) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+ if (ret)
+ goto out;
+
+ mutex_lock(&block_group->free_space_lock);
+
+ /*
+ * Iterate through all of the extent and metadata items in this block
+ * group, adding the free space between them and the free space at the
+ * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
+ * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
+ * contained in.
+ */
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out_locked;
+ ASSERT(ret == 0);
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+ while (1) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (key.objectid >= end)
+ break;
+
+ if (start < key.objectid) {
+ ret = __add_to_free_space_tree(trans, fs_info,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
+ if (ret)
+ goto out_locked;
+ }
+ start = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ start += fs_info->tree_root->nodesize;
+ else
+ start += key.offset;
+ } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ if (key.objectid != block_group->key.objectid)
+ break;
+ }
+
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ goto out_locked;
+ if (ret)
+ break;
+ }
+ if (start < end) {
+ ret = __add_to_free_space_tree(trans, fs_info, block_group,
+ path2, start, end - start);
+ if (ret)
+ goto out_locked;
+ }
+
+ ret = 0;
+out_locked:
+ mutex_unlock(&block_group->free_space_lock);
+out:
+ btrfs_free_path(path2);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *free_space_root;
+ struct btrfs_block_group_cache *block_group;
+ struct rb_node *node;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ fs_info->creating_free_space_tree = 1;
+ free_space_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_FREE_SPACE_TREE_OBJECTID);
+ if (IS_ERR(free_space_root)) {
+ ret = PTR_ERR(free_space_root);
+ goto abort;
+ }
+ fs_info->free_space_root = free_space_root;
+
+ node = rb_first(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group_cache,
+ cache_node);
+ ret = populate_free_space_tree(trans, fs_info, block_group);
+ if (ret)
+ goto abort;
+ node = rb_next(node);
+ }
+
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ fs_info->creating_free_space_tree = 0;
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ return 0;
+
+abort:
+ fs_info->creating_free_space_tree = 0;
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
+ return ret;
+}
+
+static int clear_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int nr;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
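+ /*
+ * Each search lands on the first remaining leaf; delete its items
+ * in bulk and repeat until the tree is empty.
+ */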
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ nr = btrfs_header_nritems(path->nodes[0]);
+ if (!nr)
+ break;
+
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *free_space_root = fs_info->free_space_root;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ fs_info->free_space_root = NULL;
+
+ ret = clear_free_space_tree(trans, free_space_root);
+ if (ret)
+ goto abort;
+
+ ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key);
+ if (ret)
+ goto abort;
+
+ list_del(&free_space_root->dirty_list);
+
+ btrfs_tree_lock(free_space_root->node);
+ clean_tree_block(trans, tree_root->fs_info, free_space_root->node);
+ btrfs_tree_unlock(free_space_root->node);
+ btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
+ 0, 1);
+
+ free_extent_buffer(free_space_root->node);
+ free_extent_buffer(free_space_root->commit_root);
+ kfree(free_space_root);
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ return 0;
+
+abort:
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
+ return ret;
+}
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ block_group->needs_free_space = 0;
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path);
+ if (ret)
+ return ret;
+
+ return __add_to_free_space_tree(trans, fs_info, block_group, path,
+ block_group->key.objectid,
+ block_group->key.offset);
+}
+
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ mutex_lock(&block_group->free_space_lock);
+ if (!block_group->needs_free_space)
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+
+out:
+ btrfs_free_path(path);
+ mutex_unlock(&block_group->free_space_lock);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_path *path;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ u64 start, end;
+ int done = 0, nr;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ if (block_group->needs_free_space) {
+ /* We never added this block group to the free space tree. */
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
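+ /* Include the info item itself in the deletion. */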
+ nr++;
+ path->slots[0]--;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
+ found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 end, offset;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = fs_info->free_space_root;
+
+ end = block_group->key.objectid + block_group->key.offset;
+
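+ /*
+ * The caller left path pointing to the free space info item, so
+ * each btrfs_next_item() call yields this block group's bitmap
+ * items in order.
+ */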
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ caching_ctl->progress = key.objectid;
+
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(block_group, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ total_found += add_new_free_space(block_group,
+ fs_info,
+ extent_start,
+ offset);
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += block_group->sectorsize;
+ }
+ }
+ if (prev_bit == 1) {
+ total_found += add_new_free_space(block_group, fs_info,
+ extent_start, end);
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ caching_ctl->progress = (u64)-1;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ u64 end;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = fs_info->free_space_root;
+
+ end = block_group->key.objectid + block_group->key.offset;
+
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ caching_ctl->progress = key.objectid;
+
+ total_found += add_new_free_space(block_group, fs_info,
+ key.objectid,
+ key.objectid + key.offset);
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ caching_ctl->progress = (u64)-1;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_free_space_info *info;
+ struct btrfs_path *path;
+ u32 extent_count, flags;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Just like caching_thread() doesn't want to deadlock on the extent
+ * tree, we don't want to deadlock on the free space tree.
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = READA_FORWARD;
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+
+ /*
+ * We left path pointing to the free space info item, so now
+ * load_free_space_foo can just iterate through the free space tree from
+ * there.
+ */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
+ ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ else
+ ret = load_free_space_extents(caching_ctl, path, extent_count);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
new file mode 100644
index 000000000000..54ffced3bce8
--- /dev/null
+++ b/fs/btrfs/free-space-tree.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_TREE
+#define __BTRFS_FREE_SPACE_TREE
+
+/*
+ * The default size for new free space bitmap items. The last bitmap in a block
+ * group may be truncated, and none of the free space tree code assumes that
+ * existing bitmaps are this size.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
+#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
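+/*
+ * For example, with a 4K sectorsize, each full bitmap item covers
+ * 2048 * 4K = 8M of space.
+ */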
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+
+/* Exposed for testing. */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow);
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset);
+
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 265e03c73f4d..be4d22a5022f 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
name, name_len, &extref)) {
- btrfs_std_error(root->fs_info, -ENOENT);
+ btrfs_std_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582ac3f73..8b57c17b3fb3 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -48,7 +48,7 @@ static int caching_kthread(void *data)
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.offset = 0;
@@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
}
}
-#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
/*
@@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_ino_op = {
+static const struct btrfs_free_space_op free_ino_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
return false;
}
-static struct btrfs_free_space_op pinned_free_ino_op = {
+static const struct btrfs_free_space_op pinned_free_ino_op = {
.recalc_thresholds = pinned_recalc_thresholds,
.use_bitmap = pinned_use_bitmap,
};
@@ -488,17 +488,17 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_space(inode, prealloc);
+ btrfs_delalloc_release_space(inode, 0, prealloc);
goto out_put;
}
- btrfs_free_reserved_data_space(inode, prealloc);
+ btrfs_free_reserved_data_space(inode, 0, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b66d73e80..247830107686 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
+struct btrfs_dio_data {
+ u64 outstanding_extents;
+ u64 reserve;
+ u64 unsubmitted_oe_range_start;
+ u64 unsubmitted_oe_range_end;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
-static struct extent_io_ops btrfs_extent_io_ops;
+static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
-static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
-static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
@@ -310,6 +316,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
+ /*
+ * Don't forget to free the reserved space: an inlined extent
+ * doesn't count as a data extent, so free its reservation
+ * directly here. Reservations are always page aligned, so
+ * just free one page.
+ */
+ btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -407,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode,
unsigned long nr_pages_ret = 0;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned long max_compressed = 128 * 1024;
- unsigned long max_uncompressed = 128 * 1024;
+ unsigned long max_compressed = SZ_128K;
+ unsigned long max_uncompressed = SZ_128K;
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
int redirty = 0;
/* if this is a small write inside eof, kick off a defrag */
- if ((end - start + 1) < 16 * 1024 &&
+ if ((end - start + 1) < SZ_16K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -423,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode,
again:
will_compress = 0;
nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
- nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+ nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
/*
* we don't want to send crud past the end of i_size through
@@ -937,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes;
/* if this is a small write inside eof, kick off defrag */
- if (num_bytes < 64 * 1024 &&
+ if (num_bytes < SZ_64K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -1096,8 +1109,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
+ /*
+ * atomic_sub_return implies a barrier for waitqueue_active
+ */
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
- 5 * 1024 * 1024 &&
+ 5 * SZ_1M &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1122,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1024;
+ int limit = 10 * SZ_1M;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
@@ -1138,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
!btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
- cur_end = min(end, start + 512 * 1024 - 1);
+ cur_end = min(end, start + SZ_512K - 1);
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
@@ -1294,8 +1310,14 @@ next_slot:
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid > ino ||
- found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ if (found_key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(found_key.objectid < ino) ||
+ found_key.type < BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
@@ -1766,7 +1788,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space_noquota(inode,
+ state->start, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
root->fs_info->delalloc_batch);
@@ -1861,15 +1884,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
int ret = 0;
int skip_sum;
- int metadata = 0;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
if (btrfs_is_free_space_inode(inode))
- metadata = 2;
+ metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
if (!(rw & REQ_WRITE)) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1972,7 +1995,7 @@ again:
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
@@ -1989,7 +2012,8 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -2115,7 +2139,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
- btrfs_ino(inode), file_pos, &ins);
+ btrfs_ino(inode), file_pos,
+ ram_bytes, &ins);
+ /*
+ * Release the reserved range from the inode's dirty range map, as
+ * it has already been moved into the delayed_ref_head.
+ */
+ btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
out:
btrfs_free_path(path);
@@ -2458,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
lock_start = backref->file_pos;
lock_end = backref->file_pos + backref->num_bytes - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- 0, &cached);
+ &cached);
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
if (ordered) {
@@ -2573,7 +2603,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
new->disk_len, 0,
backref->root_id, backref->inum,
- new->file_pos, 0); /* start - extent_offset */
+ new->file_pos); /* start - extent_offset */
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_free_path;
@@ -2599,7 +2629,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
return;
list_for_each_entry_safe(old, tmp, &new->head, list) {
- list_del(&old->list);
kfree(old);
}
kfree(new);
@@ -2824,6 +2853,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+ /*
+ * For the mwrite case (mmap + memset to write), we still reserve
+ * space for the NOCOW range. Since NOCOW doesn't create a new
+ * delayed ref, just free the space here.
+ */
+ btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
@@ -2843,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state);
+ &cached_state);
ret = test_range_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
@@ -3018,8 +3055,6 @@ static int __readpage_endio_check(struct inode *inode,
char *kaddr;
u32 csum_expected;
u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
csum_expected = *(((u32 *)io_bio->csum) + icsum);
@@ -3032,9 +3067,8 @@ static int __readpage_endio_check(struct inode *inode,
kunmap_atomic(kaddr);
return 0;
zeroit:
- if (__ratelimit(&_rs))
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -3078,55 +3112,47 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
start, (size_t)(end - start + 1));
}
-struct delayed_iput {
- struct list_head list;
- struct inode *inode;
-};
-
-/* JDM: If this is fs-wide, why can't we add a pointer to
- * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct delayed_iput *delayed;
+ struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
- delayed->inode = inode;
-
spin_lock(&fs_info->delayed_iput_lock);
- list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ if (binode->delayed_iput_count == 0) {
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ } else {
+ binode->delayed_iput_count++;
+ }
spin_unlock(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_root *root)
{
- LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- struct delayed_iput *delayed;
- int empty;
-
- spin_lock(&fs_info->delayed_iput_lock);
- empty = list_empty(&fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
- if (empty)
- return;
down_read(&fs_info->delayed_iput_sem);
-
spin_lock(&fs_info->delayed_iput_lock);
- list_splice_init(&fs_info->delayed_iputs, &list);
- spin_unlock(&fs_info->delayed_iput_lock);
-
- while (!list_empty(&list)) {
- delayed = list_entry(list.next, struct delayed_iput, list);
- list_del(&delayed->list);
- iput(delayed->inode);
- kfree(delayed);
+ while (!list_empty(&fs_info->delayed_iputs)) {
+ struct btrfs_inode *inode;
+
+ inode = list_first_entry(&fs_info->delayed_iputs,
+ struct btrfs_inode, delayed_iput);
+ if (inode->delayed_iput_count) {
+ inode->delayed_iput_count--;
+ list_move_tail(&inode->delayed_iput,
+ &fs_info->delayed_iputs);
+ } else {
+ list_del_init(&inode->delayed_iput);
+ }
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ spin_lock(&fs_info->delayed_iput_lock);
}
-
+ spin_unlock(&fs_info->delayed_iput_lock);
up_read(&root->fs_info->delayed_iput_sem);
}
@@ -3323,7 +3349,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = -ENOMEM;
goto out;
}
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -3522,10 +3548,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
int scanned = 0;
if (!xattr_access) {
- xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
- xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT));
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
@@ -3746,6 +3772,7 @@ cache_acl:
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
break;
default:
@@ -4018,9 +4045,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
*/
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- int ret;
/*
* 1 for the possible orphan item
@@ -4029,27 +4054,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- trans = btrfs_start_transaction(root, 5);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- if (PTR_ERR(trans) == -ENOSPC) {
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
- ret = btrfs_cond_migrate_bytes(root->fs_info,
- &root->fs_info->trans_block_rsv,
- num_bytes, 5);
- if (ret) {
- btrfs_end_transaction(trans, root);
- return ERR_PTR(ret);
- }
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- }
- return trans;
+ return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4217,6 +4222,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
}
+static int truncate_inline_extent(struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_key *found_key,
+ const u64 item_end,
+ const u64 new_size)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ u32 size = (u32)(new_size - found_key->offset);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
+ loff_t offset = new_size;
+ loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+
+ /*
+ * Zero out the rest of the last page of our inline extent,
+ * instead of directly truncating our inline extent here - that
+ * would be much more complex (decompressing all the data, then
+ * compressing the truncated data, which might be bigger than
+ * the size of the inline extent, resize the extent, etc).
+ * We release the path because to get the page we might need to
+ * read the extent item from disk (data not in the page cache).
+ */
+ btrfs_release_path(path);
+ return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+ }
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(root, path, size, 1);
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ inode_sub_bytes(inode, item_end + 1 - new_size);
+
+ return 0;
+}
+
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -4270,7 +4316,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
/*
* We want to drop from the next block forward in case this new size is
@@ -4301,7 +4347,7 @@ search_again:
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
if (btrfs_should_end_transaction(trans, root)) {
err = -EAGAIN;
goto error;
@@ -4411,27 +4457,40 @@ search_again:
* special encodings
*/
if (!del_item &&
- btrfs_file_extent_compression(leaf, fi) == 0 &&
btrfs_file_extent_encryption(leaf, fi) == 0 &&
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
- u32 size = new_size - found_key.offset;
-
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- inode_sub_bytes(inode, item_end + 1 -
- new_size);
/*
- * update the ram bytes to properly reflect
- * the new size of our item
+ * Need to release path in order to truncate a
+ * compressed extent. So delete any accumulated
+ * extent items so far.
*/
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size =
- btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(root, path, size, 1);
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE && pending_del_nr) {
+ err = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans,
+ root,
+ err);
+ goto error;
+ }
+ pending_del_nr = 0;
+ }
+
+ err = truncate_inline_extent(inode, path,
+ &found_key,
+ item_end,
+ new_size);
+ if (err) {
+ btrfs_abort_transaction(trans,
+ root, err);
+ goto error;
+ }
} else if (test_bit(BTRFS_ROOT_REF_COWS,
&root->state)) {
- inode_sub_bytes(inode, item_end + 1 -
- found_key.offset);
+ inode_sub_bytes(inode, item_end + 1 - new_size);
}
}
delete:
@@ -4461,7 +4520,7 @@ delete:
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- ino, extent_offset, 0);
+ ino, extent_offset);
BUG_ON(ret);
if (btrfs_should_throttle_delayed_refs(trans, root))
btrfs_async_run_delayed_refs(root,
@@ -4531,7 +4590,7 @@ error:
btrfs_free_path(path);
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
@@ -4575,14 +4634,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode,
+ round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode,
+ round_down(from, PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -4605,7 +4667,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -4650,7 +4712,8 @@ again:
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -4735,7 +4798,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ lock_extent_bits(io_tree, hole_start, block_end - 1,
&cached_state);
ordered = btrfs_lookup_ordered_range(inode, hole_start,
block_end - hole_start);
@@ -5047,7 +5110,19 @@ static void evict_inode_truncate_pages(struct inode *inode)
end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ lock_extent_bits(io_tree, start, end, &cached_state);
+
+ /*
+ * If the extent still has the DELALLOC flag, it never reached disk,
+ * and its reserved space won't be freed by the delayed ref.
+ * So we need to free its reserved space here.
+ * (Refer to comment in btrfs_invalidatepage, case 2)
+ *
+ * Note, end is the bytenr of the last byte, so we need the + 1 here.
+ */
+ if (state->state & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(inode, start, end - start + 1);
+
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -5228,7 +5303,6 @@ void btrfs_evict_inode(struct inode *inode)
no_delete:
btrfs_remove_delayed_node(inode);
clear_inode(inode);
- return;
}
/*
@@ -5677,7 +5751,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
@@ -6268,9 +6342,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -6408,7 +6479,7 @@ out_unlock_inode:
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
u64 index;
@@ -6434,6 +6505,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
goto fail;
}
@@ -6467,9 +6539,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
fail:
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -6614,7 +6687,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct inode *inode, struct page *page,
+ struct page *page,
size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
@@ -6711,7 +6784,7 @@ again:
* Chances are we'll be called again, so go ahead and do
* readahead
*/
- path->reada = 1;
+ path->reada = READA_FORWARD;
}
ret = btrfs_lookup_file_extent(trans, root, path,
@@ -6810,8 +6883,7 @@ next:
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, inode, page,
- pg_offset,
+ ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
if (ret) {
err = ret;
@@ -7307,7 +7379,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, cached_state);
+ cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure there's no ordered
@@ -7335,25 +7407,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
} else {
- /* Screw you mmap */
- ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
- if (ret)
- break;
- ret = filemap_fdatawait_range(inode->i_mapping,
- lockstart,
- lockend);
- if (ret)
- break;
-
/*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readpages() (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not yet
+ * have a corresponding bio submitted (hence it cannot
+ * complete), which makes readpages() wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
*/
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret)
- break;
+ ret = -ENOTBLK;
+ break;
}
cond_resched();
@@ -7409,10 +7477,27 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em;
}
-struct btrfs_dio_data {
- u64 outstanding_extents;
- u64 reserve;
-};
+static void adjust_dio_outstanding_extents(struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 len)
+{
+ unsigned num_extents;
+
+ num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (dio_data->outstanding_extents) {
+ dio_data->outstanding_extents -= num_extents;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents += num_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
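
The div64_u64() call is a round-up division converting a byte length into an extent count. The same arithmetic in a standalone sketch, assuming BTRFS_MAX_EXTENT_SIZE is 128 MiB (treat that value as an assumption here):

#include <stdio.h>

#define BTRFS_MAX_EXTENT_SIZE (128ULL * 1024 * 1024)  /* assumed 128 MiB */

/* Round-up division, equivalent to the kernel's
 * div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE). */
static unsigned num_extents_for_len(unsigned long long len)
{
    return (unsigned)((len + BTRFS_MAX_EXTENT_SIZE - 1) /
                      BTRFS_MAX_EXTENT_SIZE);
}

int main(void)
{
    printf("%u\n", num_extents_for_len(1));                         /* 1 */
    printf("%u\n", num_extents_for_len(BTRFS_MAX_EXTENT_SIZE));     /* 1 */
    printf("%u\n", num_extents_for_len(BTRFS_MAX_EXTENT_SIZE + 1)); /* 2 */
    return 0;
}
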
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -7449,8 +7534,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
- return -ENOTBLK;
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+ create)) {
+ ret = -ENOTBLK;
+ goto err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
@@ -7568,22 +7656,11 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- /*
- * If we have an outstanding_extents count still set then we're
- * within our reservation, otherwise we need to adjust our inode
- * counter appropriately.
- */
- if (dio_data->outstanding_extents) {
- (dio_data->outstanding_extents)--;
- } else {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
-
- btrfs_free_reserved_data_space(inode, len);
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+ btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
current->journal_info = dio_data;
}
@@ -7607,8 +7684,17 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+err:
if (dio_data)
current->journal_info = dio_data;
+ /*
+ * Compensate the delalloc release we do in btrfs_direct_IO() when we
+ * write less data than expected, so that we don't underflow our inode's
+ * outstanding extents counter.
+ */
+ if (create && dio_data)
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+
return ret;
}
@@ -7897,22 +7983,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
bio_put(bio);
}
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+ const u64 offset,
+ const u64 bytes,
+ const int uptodate)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered = NULL;
- u64 ordered_offset = dip->logical_offset;
- u64 ordered_bytes = dip->bytes;
- struct bio *dio_bio;
+ u64 ordered_offset = offset;
+ u64 ordered_bytes = bytes;
int ret;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
ordered_bytes,
- !bio->bi_error);
+ uptodate);
if (!ret)
goto out_test;
@@ -7925,13 +8011,22 @@ out_test:
* our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again
*/
- if (ordered_offset < dip->logical_offset + dip->bytes) {
- ordered_bytes = dip->logical_offset + dip->bytes -
- ordered_offset;
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
ordered = NULL;
goto again;
}
- dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct bio *dio_bio = dip->dio_bio;
+
+ btrfs_endio_direct_write_update_ordered(dip->inode,
+ dip->logical_offset,
+ dip->bytes,
+ !bio->bi_error);
kfree(dip);
@@ -8239,6 +8334,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->subio_endio = btrfs_subio_endio_read;
}
+ /*
+ * Reset the range for unsubmitted ordered extents (to a 0 length range)
+ * even if we fail to submit a bio, because in that case we do the
+ * corresponding error handling below and it must not be done a second
+ * time by btrfs_direct_IO().
+ */
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+ }
+
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
@@ -8267,24 +8377,15 @@ free_ordered:
dip = NULL;
io_bio = NULL;
} else {
- if (write) {
- struct btrfs_ordered_extent *ordered;
-
- ordered = btrfs_lookup_ordered_extent(inode,
- file_offset);
- set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- /*
- * Decrements our ref on the ordered extent and removes
- * the ordered extent from the inode's ordered tree,
- * doing all the proper resource cleanup such as for the
- * reserved space and waking up any waiters for this
- * ordered extent (through btrfs_remove_ordered_extent).
- */
- btrfs_finish_ordered_io(ordered);
- } else {
+ if (write)
+ btrfs_endio_direct_write_update_ordered(inode,
+ file_offset,
+ dio_bio->bi_iter.bi_size,
+ 0);
+ else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- }
+
dio_bio->bi_error = -EIO;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
@@ -8371,7 +8472,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
mutex_unlock(&inode->i_mutex);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, count);
+ ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
dio_data.outstanding_extents = div64_u64(count +
@@ -8384,6 +8485,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* originally calculated. Abuse current->journal_info for this.
*/
dio_data.reserve = round_up(count, root->sectorsize);
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
@@ -8400,10 +8503,23 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode,
- dio_data.reserve);
+ btrfs_delalloc_release_space(inode, offset,
+ dio_data.reserve);
+ /*
+ * On error we might have left some ordered extents
+ * without submitting corresponding bios for them, so
+ * clean them up to avoid other tasks getting them
+ * and waiting for them to complete forever.
+ */
+ if (dio_data.unsubmitted_oe_range_start <
+ dio_data.unsubmitted_oe_range_end)
+ btrfs_endio_direct_write_update_ordered(inode,
+ dio_data.unsubmitted_oe_range_start,
+ dio_data.unsubmitted_oe_range_end -
+ dio_data.unsubmitted_oe_range_start,
+ 0);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
}
out:
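
The reservation bookkeeping above follows one invariant: reserve for the whole request up front, then release exactly what was not consumed, whether the write failed outright or completed short. A toy model of that flow (illustrative only, no btrfs APIs):

#include <stdio.h>

static long long reserved;

static void reserve(long long bytes) { reserved += bytes; }
static void release(long long bytes) { reserved -= bytes; }

int main(void)
{
    long long count = 1 << 20;    /* request: 1 MiB */
    long long ret = 640 * 1024;   /* hypothetical short write */

    reserve(count);
    if (ret < 0)
        release(count);           /* total failure: give it all back */
    else if (ret < count)
        release(count - ret);     /* short write: release the tail */

    printf("still reserved: %lld bytes\n", reserved);  /* 655360 */
    return 0;
}
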
@@ -8439,15 +8555,28 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
+ struct inode *inode = page->mapping->host;
+ int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
+ /*
+ * If we are under memory pressure we will call this directly from the
+ * VM, so we need to make sure we have the inode referenced for the
+ * ordered extent. If we can't, just return as if we didn't do anything.
+ */
+ if (!igrab(inode)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ btrfs_add_delayed_iput(inode);
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -8519,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
/*
@@ -8557,11 +8686,23 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0,
+ lock_extent_bits(tree, page_start, page_end,
&cached_state);
}
}
+ /*
+ * Qgroup reserved space handler
+ * Page here will be either
+ * 1) Already written to disk
+ * In this case, its reserved space is released from the data rsv map
+ * and will eventually be freed by the delayed ref handler.
+ * So even if we call qgroup_free_data(), it won't decrease the
+ * reserved space.
+ * 2) Not written to disk
+ * This means the reserved space should be freed here.
+ */
+ btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8612,7 +8753,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_end;
sb_start_pagefault(inode->i_sb);
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8631,8 +8776,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
again:
lock_page(page);
size = i_size_read(inode);
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
@@ -8641,7 +8784,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
/*
@@ -8709,7 +8852,7 @@ out_unlock:
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -8915,6 +9058,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
+ ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -8939,6 +9083,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
+ INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
@@ -8998,6 +9143,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
@@ -9042,15 +9188,14 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_path_cachep);
if (btrfs_free_space_cachep)
kmem_cache_destroy(btrfs_free_space_cachep);
- if (btrfs_delalloc_work_cachep)
- kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -9078,13 +9223,6 @@ int btrfs_init_cachep(void)
if (!btrfs_free_space_cachep)
goto fail;
- btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
- sizeof(struct btrfs_delalloc_work), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_delalloc_work_cachep)
- goto fail;
-
return 0;
fail:
btrfs_destroy_cachep();
@@ -9308,14 +9446,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
inode = delalloc_work->inode;
- if (delalloc_work->wait) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
if (delalloc_work->delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9325,18 +9459,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
}
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput)
+ int delay_iput)
{
struct btrfs_delalloc_work *work;
- work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ work = kmalloc(sizeof(*work), GFP_NOFS);
if (!work)
return NULL;
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->wait = wait;
work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
@@ -9348,7 +9481,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
wait_for_completion(&work->completion);
- kmem_cache_free(btrfs_delalloc_work_cachep, work);
+ kfree(work);
}
/*
@@ -9384,7 +9517,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode, delay_iput);
if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9526,9 +9659,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
/*
* 2 items for inode item and ref
* 2 items for dir items
+ * 1 item for updating parent inode item
+ * 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -9559,10 +9694,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock_inode;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
- if (err)
- goto out_unlock_inode;
-
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -9595,10 +9726,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
+ /*
+ * Last step, add directory indexes for our symlink inode. This is done
+ * last so we avoid extra cleanup of these indexes if an error happens
+ * elsewhere above.
+ */
+ if (!err)
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err) {
drop_inode = 1;
goto out_unlock_inode;
@@ -9634,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 cur_offset = start;
u64 i_size;
u64 cur_bytes;
+ u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
@@ -9648,8 +9788,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
+ /*
+ * If we are severely fragmented we could end up with really
+ * small allocations, so if the allocator is returning small
+ * chunks, let's make its job easier by only searching for
+ * similarly sized chunks.
+ */
+ cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
*alloc_hint, &ins, 1, 0);
if (ret) {
@@ -9658,6 +9805,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
+ last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
@@ -9875,7 +10023,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
@@ -9904,7 +10052,7 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static struct extent_io_ops btrfs_extent_io_ops = {
+static const struct extent_io_ops btrfs_extent_io_ops = {
.fill_delalloc = run_delalloc_range,
.submit_bio_hook = btrfs_submit_bio_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
@@ -9952,7 +10100,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
@@ -9966,7 +10114,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
@@ -9975,13 +10123,12 @@ static const struct inode_operations btrfs_special_inode_operations = {
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.update_time = btrfs_update_time,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d20f3b1cab0..2a47a3148ec8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -655,22 +655,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+ GFP_NOFS);
+ pending_snapshot->path = btrfs_alloc_path();
+ if (!pending_snapshot->root_item || !pending_snapshot->path) {
+ ret = -ENOMEM;
+ goto free_pending;
+ }
+
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- goto out;
+ goto dec_and_free;
btrfs_wait_ordered_extents(root, -1);
- pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- goto out;
- }
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
@@ -686,7 +692,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto free;
+ goto dec_and_free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -737,11 +743,14 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-free:
- kfree(pending_snapshot);
-out:
+dec_and_free:
if (atomic_dec_and_test(&root->will_be_snapshoted))
wake_up_atomic_t(&root->will_be_snapshoted);
+free_pending:
+ kfree(pending_snapshot->root_item);
+ btrfs_free_path(pending_snapshot->path);
+ kfree(pending_snapshot);
+
return ret;
}
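
The relabeled error path follows the common kernel unwind shape: allocate everything that can fail cheaply first, then order the cleanup labels from most-done to least-done; since kfree() and btrfs_free_path() accept NULL, free_pending is safe even when only some allocations succeeded. A generic userspace sketch of the same shape (not btrfs code):

#include <stdlib.h>
#include <stdio.h>

struct pending {
    void *root_item;
    void *path;
};

static int do_snapshot_like_work(void)
{
    struct pending *p;
    int ret;

    p = calloc(1, sizeof(*p));
    if (!p)
        return -1;

    p->root_item = malloc(64);
    p->path = malloc(64);
    if (!p->root_item || !p->path) {
        ret = -1;
        goto free_pending;
    }

    ret = 0;              /* ... the actual work would go here ... */

free_pending:
    free(p->root_item);   /* NULL-safe, like kfree() */
    free(p->path);
    free(p);
    return ret;
}

int main(void)
{
    printf("ret=%d\n", do_snapshot_like_work());
    return 0;
}
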
@@ -992,7 +1001,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
u64 end = start + len - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, 0, &cached);
+ lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
@@ -1016,7 +1025,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+ (em->block_len > SZ_128K && next->block_len > SZ_128K))
ret = false;
free_extent_map(next);
@@ -1120,7 +1129,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1139,7 +1149,7 @@ again:
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
@@ -1199,7 +1209,7 @@ again:
page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state);
+ page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
@@ -1210,7 +1220,8 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
}
@@ -1235,7 +1246,9 @@ out:
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode,
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
return ret;
}
@@ -1258,9 +1271,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)128 * 1024 - 1);
+ u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
if (isize == 0)
@@ -1277,7 +1290,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
if (extent_thresh == 0)
- extent_thresh = 256 * 1024;
+ extent_thresh = SZ_256K;
/*
* if we were not given a file, allocate a readahead
@@ -1309,7 +1322,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (newer_than) {
ret = find_new_extents(root, inode, newer_than,
- &newer_off, 64 * 1024);
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
/*
@@ -1342,7 +1355,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (btrfs_defrag_cancelled(root->fs_info)) {
- printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+ btrfs_debug(root->fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
@@ -1399,9 +1412,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
newer_off = max(newer_off + 1,
(u64)i << PAGE_CACHE_SHIFT);
- ret = find_new_extents(root, inode,
- newer_than, &newer_off,
- 64 * 1024);
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
@@ -1567,7 +1579,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = old_size + new_size;
}
- if (new_size < 256 * 1024 * 1024) {
+ if (new_size < SZ_256M) {
ret = -EINVAL;
goto out_free;
}
@@ -1579,7 +1591,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = div_u64(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+ btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
@@ -2081,7 +2093,7 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n",
+ btrfs_err(info, "could not find root %llu",
sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
@@ -2156,7 +2168,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
struct inode *inode;
int ret;
size_t buf_size;
- const size_t buf_limit = 16 * 1024 * 1024;
+ const size_t buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2221,7 +2233,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+ btrfs_err(info, "could not find root %llu", tree_id);
ret = -ENOENT;
goto out;
}
@@ -2699,7 +2711,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
- struct btrfs_device *next;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
@@ -2711,7 +2722,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
fi_args->num_devices = fs_devices->num_devices;
memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
- list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
@@ -2959,7 +2970,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
flush_dcache_page(dst_page);
if (memcmp(addr, dst_addr, cmp_len))
- ret = BTRFS_SAME_DATA_DIFFERS;
+ ret = -EBADE;
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
@@ -3093,55 +3104,18 @@ out_unlock:
return ret;
}
-#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-static long btrfs_ioctl_file_extent_same(struct file *file,
- struct btrfs_ioctl_same_args __user *argp)
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff)
{
- struct btrfs_ioctl_same_args *same = NULL;
- struct btrfs_ioctl_same_extent_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- unsigned long size;
+ struct inode *src = file_inode(src_file);
+ struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- bool is_admin = capable(CAP_SYS_ADMIN);
- u16 count;
-
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- if (get_user(count, &argp->dest_count)) {
- ret = -EFAULT;
- goto out;
- }
-
- size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
+ ssize_t res;
- same = memdup_user(argp, size);
-
- if (IS_ERR(same)) {
- ret = PTR_ERR(same);
- same = NULL;
- goto out;
- }
-
- off = same->logical_offset;
- len = same->length;
-
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > BTRFS_MAX_DEDUPE_LEN)
- len = BTRFS_MAX_DEDUPE_LEN;
+ if (olen > BTRFS_MAX_DEDUPE_LEN)
+ olen = BTRFS_MAX_DEDUPE_LEN;
if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
/*
@@ -3149,93 +3123,13 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
- ret = -EINVAL;
- goto out;
- }
-
- ret = -EISDIR;
- if (S_ISDIR(src->i_mode))
- goto out;
-
- ret = -EACCES;
- if (!S_ISREG(src->i_mode))
- goto out;
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = 0;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct inode *dst;
- struct fd dst_file = fdget(info->fd);
- if (!dst_file.file) {
- info->status = -EBADF;
- continue;
- }
- dst = file_inode(dst_file.file);
-
- if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
- info->status = -EINVAL;
- } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
- info->status = -EXDEV;
- } else if (S_ISDIR(dst->i_mode)) {
- info->status = -EISDIR;
- } else if (!S_ISREG(dst->i_mode)) {
- info->status = -EACCES;
- } else {
- info->status = btrfs_extent_same(src, off, len, dst,
- info->logical_offset);
- if (info->status == 0)
- info->bytes_deduped += len;
- }
- fdput(dst_file);
+ return -EINVAL;
}
- ret = copy_to_user(argp, same, size);
- if (ret)
- ret = -EFAULT;
-
-out:
- mnt_drop_write_file(file);
- kfree(same);
- return ret;
-}
-
-/* Helper to check and see if this root currently has a ref on the given disk
- * bytenr. If it does then we need to update the quota for this root. This
- * doesn't do anything if quotas aren't enabled.
- */
-static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 disko)
-{
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
- struct ulist *roots;
- struct ulist_iterator uiter;
- struct ulist_node *root_node = NULL;
- int ret;
-
- if (!root->fs_info->quota_enabled)
- return 1;
-
- btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
- ret = btrfs_find_all_roots(trans, root->fs_info, disko,
- tree_mod_seq_elem.seq, &roots);
- if (ret < 0)
- goto out;
- ret = 0;
- ULIST_ITER_INIT(&uiter);
- while ((root_node = ulist_next(roots, &uiter))) {
- if (root_node->val == root->objectid) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
-out:
- btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
- return ret;
+ res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
+ if (res)
+ return res;
+ return olen;
}
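
With the per-ioctl plumbing gone, userspace reaches btrfs_dedupe_file_range() through the generic FIDEDUPERANGE ioctl. A hedged caller sketch, assuming <linux/fs.h> exposes FIDEDUPERANGE and struct file_dedupe_range (kernels of this generation and later); note that the -EBADE above is reported through info[0].status as a "data differs" result rather than failing the ioctl:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
        return 1;
    }
    int src = open(argv[1], O_RDONLY);
    int dst = open(argv[2], O_RDWR);
    if (src < 0 || dst < 0) {
        perror("open");
        return 1;
    }

    /* One destination range; the flexible info[] array holds it. */
    struct file_dedupe_range *r =
        calloc(1, sizeof(*r) + sizeof(struct file_dedupe_range_info));
    if (!r)
        return 1;
    r->src_offset = 0;
    r->src_length = 4096;         /* dedupe the first 4 KiB */
    r->dest_count = 1;
    r->info[0].dest_fd = dst;
    r->info[0].dest_offset = 0;

    if (ioctl(src, FIDEDUPERANGE, r) < 0) {
        perror("FIDEDUPERANGE");
        return 1;
    }
    /* status: 0 on success, a negative errno, or a "data differs"
     * value -- the -EBADE produced by btrfs_cmp_data() above. */
    printf("status=%d bytes_deduped=%llu\n", r->info[0].status,
           (unsigned long long)r->info[0].bytes_deduped);
    free(r);
    return 0;
}
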
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
@@ -3328,6 +3222,150 @@ static void clone_update_extent_map(struct inode *inode,
&BTRFS_I(inode)->runtime_flags);
}
+/*
+ * Make sure we do not end up inserting an inline extent into a file that
+ * already has other (non-inline) extents. If a file has an inline extent it
+ * cannot have any other extents and the (single) inline extent must start at
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON()s in mm,
+ * etc.
+ *
+ * We can have extents that have already been written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall
+ * outside the range we are currently locking in the inode's io tree. So we
+ * check the inode's i_size because of that (i_size updates are done while
+ * holding the i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but
+ * has extents beyond it, due to a fallocate call with FALLOC_FL_KEEP_SIZE
+ * (and we are protected against such concurrent fallocate calls by the
+ * i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need to turn the inline extent into a non-inline one
+ * (even with NO_HOLES enabled). If we find our destination inode only has one
+ * inline extent, just overwrite it with the source inline extent if its size
+ * is less than the source extent's size, or we could copy the source inline
+ * extent's data into the destination inode's inline extent if the latter is
+ * greater than the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+ struct inode *dst,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 skip,
+ const u64 size,
+ char *inline_data)
+{
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ root->sectorsize);
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0)
+ return -EOPNOTSUPP;
+
+ key.objectid = btrfs_ino(dst);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(dst) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(key.offset > 0);
+ return -EOPNOTSUPP;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+ u64 ext_len;
+
+ /*
+ * If the file size is <= datal, make sure there are no other
+ * extents following (can happen due to a fallocate call with
+ * the flag FALLOC_FL_KEEP_SIZE).
+ */
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent, it cannot have other extents
+ * following it.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+ if (ext_len > aligned_end)
+ return -EOPNOTSUPP;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == btrfs_ino(dst) &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ return -EOPNOTSUPP;
+ }
+ }
+
+copy_inline_extent:
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All of these cases are dealt with the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * If the destination inode has an inline extent...
+ * This would require copying the data from the source inline
+ * extent into the beginning of the destination's inline extent.
+ * But this is really complex, both extents can be compressed
+ * or just one of them, which would require decompressing and
+ * re-compressing data (which could increase the new compressed
+ * size, not allowing the compressed data to fit anymore in an
+ * inline extent).
+ * So just don't support this case for now (it should be rare,
+ * we are not really saving space when cloning inline extents).
+ */
+ return -EOPNOTSUPP;
+ }
+
+ btrfs_release_path(path);
+ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ if (ret)
+ return ret;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ return ret;
+
+ if (skip) {
+ const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+ memmove(inline_data + start, inline_data + start + skip, datal);
+ }
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ inode_add_bytes(dst, datal);
+
+ return 0;
+}
+
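
The offset-zero rule is observable from userspace: asking btrfs to clone a small (inline-candidate) file into a non-zero offset of another file is one of the cases rejected with -EOPNOTSUPP. A hypothetical demonstration via the FICLONERANGE ioctl (assuming <linux/fs.h> provides it; a src_length of 0 means "clone to EOF" in this ABI, and the usual block-alignment and same-filesystem constraints still apply):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <small-src> <dst>\n", argv[0]);
        return 1;
    }
    int src = open(argv[1], O_RDONLY);
    int dst = open(argv[2], O_RDWR);
    if (src < 0 || dst < 0) {
        perror("open");
        return 1;
    }

    struct file_clone_range fcr = {
        .src_fd = src,
        .src_offset = 0,
        .src_length = 0,       /* 0 means "to EOF" */
        .dest_offset = 8192,   /* non-zero, block-aligned destination */
    };
    /* If src's data sits in an inline extent, btrfs is expected to
     * reject this with EOPNOTSUPP per clone_copy_inline_extent(). */
    if (ioctl(dst, FICLONERANGE, &fcr) < 0)
        perror("FICLONERANGE");
    else
        puts("clone succeeded");
    return 0;
}
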
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -3352,9 +3390,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u32 nritems;
int slot;
int ret;
- int no_quota;
const u64 len = olen_aligned;
- u64 last_disko = 0;
u64 last_dest_end = destoff;
ret = -ENOMEM;
@@ -3368,7 +3404,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -3400,7 +3436,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
- no_quota = 1;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
@@ -3552,35 +3587,13 @@ process_slot:
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
- /*
- * We need to look up the roots that point at
- * this bytenr and see if the new root does. If
- * it does not we need to make sure we update
- * quotas appropriately.
- */
- if (disko && root != BTRFS_I(src)->root &&
- disko != last_disko) {
- no_quota = check_ref(trans, root,
- disko);
- if (no_quota < 0) {
- btrfs_abort_transaction(trans,
- root,
- ret);
- btrfs_end_transaction(trans,
- root);
- ret = no_quota;
- goto out;
- }
- }
-
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(inode),
- new_key.offset - datao,
- no_quota);
+ new_key.offset - datao);
if (ret) {
btrfs_abort_transaction(trans,
root,
@@ -3594,21 +3607,6 @@ process_slot:
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
- u64 aligned_end = 0;
-
- /*
- * Don't copy an inline extent into an offset
- * greater than zero. Having an inline extent
- * at such an offset results in chaos as btrfs
- * isn't prepared for such cases. Just skip
- * this case for the same reasons as commented
- * at btrfs_ioctl_clone().
- */
- if (last_dest_end > 0) {
- ret = -EOPNOTSUPP;
- btrfs_end_transaction(trans, root);
- goto out;
- }
if (off > key.offset) {
skip = off - key.offset;
@@ -3626,42 +3624,22 @@ process_slot:
size -= skip + trim;
datal -= skip + trim;
- aligned_end = ALIGN(new_key.offset + datal,
- root->sectorsize);
- ret = btrfs_drop_extents(trans, root, inode,
- drop_start,
- aligned_end,
- 1);
+ ret = clone_copy_inline_extent(src, inode,
+ trans, path,
+ &new_key,
+ drop_start,
+ datal,
+ skip, size, buf);
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root, ret);
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
- ret = btrfs_insert_empty_item(trans, root, path,
- &new_key, size);
- if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ root,
+ ret);
btrfs_end_transaction(trans, root);
goto out;
}
-
- if (skip) {
- u32 start =
- btrfs_file_extent_calc_inline_size(0);
- memmove(buf+start, buf+start+skip,
- datal);
- }
-
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
- inode_add_bytes(inode, datal);
}
/* If we have an implicit hole (NO_HOLES feature). */
@@ -3727,17 +3705,16 @@ out:
return ret;
}
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct fd src_file;
- struct inode *src;
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- int same_inode = 0;
+ int same_inode = src == inode;
/*
* TODO:
@@ -3750,49 +3727,20 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* be either compressed or non-compressed.
*/
- /* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
- return -EINVAL;
-
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- src_file = fdget(srcfd);
- if (!src_file.file) {
- ret = -EBADF;
- goto out_drop_write;
- }
-
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != file->f_path.mnt)
- goto out_fput;
-
- src = file_inode(src_file.file);
-
- ret = -EINVAL;
- if (src == inode)
- same_inode = 1;
-
- /* the src must be open for reading */
- if (!(src_file.file->f_mode & FMODE_READ))
- goto out_fput;
+ if (file_src->f_path.mnt != file->f_path.mnt ||
+ src->i_sb != inode->i_sb)
+ return -EXDEV;
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- goto out_fput;
+ return -EINVAL;
- ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- goto out_fput;
-
- ret = -EXDEV;
- if (src->i_sb != inode->i_sb)
- goto out_fput;
+ return -EISDIR;
if (!same_inode) {
btrfs_double_inode_lock(src, inode);
@@ -3869,21 +3817,25 @@ out_unlock:
btrfs_double_inode_unlock(src, inode);
else
mutex_unlock(&src->i_mutex);
-out_fput:
- fdput(src_file);
-out_drop_write:
- mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
{
- struct btrfs_ioctl_clone_range_args args;
+ ssize_t ret;
- if (copy_from_user(&args, argp, sizeof(args)))
- return -EFAULT;
- return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
- args.src_length, args.dest_offset);
+ ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
+ if (ret == 0)
+ ret = len;
+ return ret;
+}
+
+int btrfs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
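
btrfs_copy_file_range() backs the copy_file_range(2) syscall introduced in this kernel generation, so a plain userspace loop exercises the clone path; a minimal sketch, assuming a glibc recent enough (2.27+) to wrap the syscall:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
        return 1;
    }
    int in = open(argv[1], O_RDONLY);
    int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (in < 0 || out < 0) {
        perror("open");
        return 1;
    }

    struct stat st;
    if (fstat(in, &st) < 0) {
        perror("fstat");
        return 1;
    }

    /* The kernel may copy less than asked per call, so loop. */
    off_t left = st.st_size;
    while (left > 0) {
        ssize_t n = copy_file_range(in, NULL, out, NULL, left, 0);
        if (n <= 0) {
            perror("copy_file_range");
            return 1;
        }
        left -= n;
    }
    return 0;
}
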
/*
@@ -4095,7 +4047,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
return -ENOMEM;
space_args.total_spaces = 0;
- dest = kmalloc(alloc_size, GFP_NOFS);
+ dest = kmalloc(alloc_size, GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest_orig = dest;
@@ -4472,7 +4424,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- size = min_t(u32, loi->size, 64 * 1024);
+ size = min_t(u32, loi->size, SZ_64K);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -4621,7 +4573,7 @@ locked:
goto out_bargs;
}
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
@@ -4707,7 +4659,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
goto out;
}
- bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
if (!bargs) {
ret = -ENOMEM;
goto out;
@@ -4814,7 +4766,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"failed to update qgroup status and info\n");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
@@ -4967,7 +4919,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
if (!qsa)
return -ENOMEM;
@@ -5097,7 +5049,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
goto out;
}
- args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
ret = -ENOMEM;
goto out;
@@ -5234,7 +5186,7 @@ out_unlock:
static int btrfs_ioctl_get_supported_features(struct file *file,
void __user *arg)
{
- static struct btrfs_ioctl_feature_flags features[3] = {
+ static const struct btrfs_ioctl_feature_flags features[3] = {
INIT_FEATURE_FLAGS(SUPP),
INIT_FEATURE_FLAGS(SAFE_SET),
INIT_FEATURE_FLAGS(SAFE_CLEAR)
@@ -5433,10 +5385,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_CLONE:
- return btrfs_ioctl_clone(file, arg, 0, 0, 0);
- case BTRFS_IOC_CLONE_RANGE:
- return btrfs_ioctl_clone_range(file, argp);
case BTRFS_IOC_TRANS_START:
return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END:
@@ -5514,8 +5462,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
- case BTRFS_IOC_FILE_EXTENT_SAME:
- return btrfs_ioctl_file_extent_same(file, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(file, argp);
case BTRFS_IOC_GET_FEATURES:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d7e6baf1b205..d13128c70ddd 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
atomic_dec(&eb->spinning_readers);
read_unlock(&eb->lock);
}
- return;
}
/*
@@ -79,6 +78,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_writers) &&
waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
@@ -86,11 +88,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
}
- return;
}
/*
@@ -229,6 +233,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
@@ -280,6 +287,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
+ /*
+ * Make sure the counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
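
All of the comments added to locking.c document the same invariant: the waker's update of the wait condition must be ordered before its waitqueue_active() check, otherwise a waiter that has just queued itself can miss the wakeup. atomic_dec_and_test() implies the needed full barrier, while the bare atomic_dec() needs the explicit smp_mb(). A userspace analogy of the pattern in C11 atomics (an illustration, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* The condition and the "is anyone waiting?" flag. The seq_cst RMW on
 * blocking_writers plays the role of atomic_dec_and_test(): the
 * condition update is globally ordered before the waiter_queued load. */
static atomic_int blocking_writers = 1;
static atomic_int waiter_queued;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;

static void *unlocker(void *arg)
{
    (void)arg;
    if (atomic_fetch_sub(&blocking_writers, 1) == 1 &&  /* dec_and_test */
        atomic_load(&waiter_queued)) {                  /* waitqueue_active */
        pthread_mutex_lock(&lock);
        pthread_cond_signal(&wq);
        pthread_mutex_unlock(&lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_mutex_lock(&lock);
    atomic_store(&waiter_queued, 1);              /* queue ourselves first... */
    pthread_create(&t, NULL, unlocker, NULL);
    while (atomic_load(&blocking_writers) != 0)   /* ...then test condition */
        pthread_cond_wait(&wq, &lock);
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    puts("woken once blocking_writers hit zero");
    return 0;
}
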
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52170cf1757e..8c27292ea9ea 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -409,6 +412,9 @@ have_entry:
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
spin_lock_irq(&log->log_extents_lock[index]);
while (!list_empty(&log->logged_list[index])) {
+ struct inode *inode;
ordered = list_first_entry(&log->logged_list[index],
struct btrfs_ordered_extent,
log_list);
list_del_init(&ordered->log_list);
+ inode = ordered->inode;
spin_unlock_irq(&log->log_extents_lock[index]);
if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- struct inode *inode = ordered->inode;
u64 start = ordered->file_offset;
u64 end = ordered->file_offset + ordered->len - 1;
@@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
&ordered->flags));
/*
- * If our ordered extent completed it means it updated the
- * fs/subvol and csum trees already, so no need to make the
- * current transaction's commit wait for it, as we end up
- * holding memory unnecessarily and delaying the inode's iput
- * until the transaction commit (we schedule an iput for the
- * inode when the ordered extent's refcount drops to 0), which
- * prevents it from being evictable until the transaction
- * commits.
+ * In order to keep us from losing our ordered extent
+ * information when committing the transaction we have to make
+ * sure that any logged extents are completed when we go to
+ * commit the transaction. To do this we simply increase the
+ * current transaction's pending_ordered counter and decrement it
+ * when the ordered extent completes.
*/
- if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
- btrfs_put_ordered_extent(ordered);
- else
- list_add_tail(&ordered->trans_list, &trans->ordered);
-
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&tree->lock);
+ }
+ btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
+ bool dec_pending_ordered = false;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
+ dec_pending_ordered = true;
spin_unlock_irq(&tree->lock);
+ /*
+ * The currently running transaction is waiting on us; we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (dec_pending_ordered) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality: it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check whether trans is set, and ASSERT()
+ * so that a developer will notice if it isn't.
+ */
+ spin_lock(&root->fs_info->trans_lock);
+ trans = root->fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
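
The two ordered-data.c hunks form a pair: btrfs_wait_logged_extents() tags a still-running ordered extent BTRFS_ORDERED_PENDING and bumps the transaction's pending_ordered counter, and btrfs_remove_ordered_extent() later drops the counter and wakes the committer. Both sides re-check the flags under the tree lock so the count can never leak. The protocol reduced to its essentials, as a sketch with simplified stand-in types (not the btrfs structures):

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

#define ITEM_COMPLETE	0
#define ITEM_PENDING	1

struct item { spinlock_t lock; unsigned long flags; };
struct txn  { atomic_t pending; wait_queue_head_t pending_wait; };

/* Commit side: count a still-running item against the transaction. */
static void mark_pending(struct item *it, struct txn *t)
{
	spin_lock_irq(&it->lock);
	/* Re-check under the lock: either we see ITEM_COMPLETE, or the
	 * completing side will see ITEM_PENDING -- never neither. */
	if (!test_bit(ITEM_COMPLETE, &it->flags)) {
		set_bit(ITEM_PENDING, &it->flags);
		atomic_inc(&t->pending);
	}
	spin_unlock_irq(&it->lock);
}

/* Completion side: the last pending item wakes the committer. */
static void complete_item(struct item *it, struct txn *t)
{
	bool dec;

	spin_lock_irq(&it->lock);
	set_bit(ITEM_COMPLETE, &it->flags);
	dec = test_and_clear_bit(ITEM_PENDING, &it->flags);
	spin_unlock_irq(&it->lock);

	if (dec && atomic_dec_and_test(&t->pending))
		wake_up(&t->pending_wait);
}
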
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0fe43f..23c96059cef2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
* in the logging code. */
+#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
+ * complete in the current transaction. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dca137b04095..f9e60231f685 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
.extract = prop_compression_extract,
.inheritable = 1
},
- {
- .xattr_name = NULL
- }
};
void __init btrfs_props_init(void)
{
- struct prop_handler *p;
+ int i;
hash_init(prop_handlers_ht);
- for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ struct prop_handler *p = &prop_handlers[i];
u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *parent)
{
- const struct prop_handler *h;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ int i;
if (!test_bit(BTRFS_INODE_HAS_PROPS,
&BTRFS_I(parent)->runtime_flags))
return 0;
- for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ const struct prop_handler *h = &prop_handlers[i];
const char *value;
u64 num_bytes;
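
The props.c change swaps a NULL-sentinel-terminated array for ARRAY_SIZE() iteration, dropping the dummy terminator entry and making the bound explicit. The general pattern, as a small self-contained sketch (register_one() is a hypothetical stand-in):

#include <linux/kernel.h>	/* ARRAY_SIZE() */

struct handler { const char *name; };

static void register_one(const struct handler *h) { /* ... */ }

static struct handler handlers[] = {
	{ .name = "compression" },
	/* no { .name = NULL } sentinel entry required */
};

static void init_handlers(void)
{
	int i;

	/* Iterate by count, not by sentinel: the array stays dense
	 * and the bound is known at compile time. */
	for (i = 0; i < ARRAY_SIZE(handlers); i++)
		register_one(&handlers[i]);
}
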
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d904ee1c5349..5279fdae7142 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -993,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
- spin_lock(&fs_info->qgroup_lock);
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ btrfs_qgroup_wait_for_completion(fs_info);
+ spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -1461,6 +1462,8 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ assert_spin_locked(&delayed_refs->lock);
+
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
@@ -1652,10 +1655,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
}
}
- /* For exclusive extent, free its reserved bytes too */
- if (nr_old_roots == 0 && nr_new_roots == 1 &&
- cur_new_count == nr_new_roots)
- qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
@@ -2035,7 +2034,7 @@ out:
return ret;
}
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -2116,14 +2115,13 @@ out:
return ret;
}
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
- u64 ref_root = root->root_key.objectid;
int ret = 0;
if (!is_fstree(ref_root))
@@ -2169,6 +2167,11 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+ num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2188,17 +2191,16 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans)
{
struct btrfs_key found;
+ struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
int slot;
int ret;
- path->leave_spinning = 1;
mutex_lock(&fs_info->qgroup_rescan_lock);
ret = btrfs_search_slot_for_read(fs_info->extent_root,
&fs_info->qgroup_rescan_progress,
@@ -2229,7 +2231,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!scratch_leaf) {
+ ret = -ENOMEM;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto out;
+ }
+ extent_buffer_get(scratch_leaf);
+ btrfs_tree_read_lock(scratch_leaf);
+ btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2255,6 +2265,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
goto out;
}
out:
+ if (scratch_leaf) {
+ btrfs_tree_read_unlock_blocking(scratch_leaf);
+ free_extent_buffer(scratch_leaf);
+ }
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
return ret;
@@ -2266,19 +2280,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
goto out;
- scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
- if (!scratch_leaf)
- goto out;
err = 0;
- while (!err) {
+ while (!err && !btrfs_fs_closing(fs_info)) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -2287,8 +2297,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
if (!fs_info->quota_enabled) {
err = -EINTR;
} else {
- err = qgroup_rescan_leaf(fs_info, path, trans,
- scratch_leaf);
+ err = qgroup_rescan_leaf(fs_info, path, trans);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2297,11 +2306,11 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
}
out:
- kfree(scratch_leaf);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2330,7 +2339,9 @@ out:
}
btrfs_end_transaction(trans, fs_info->quota_root);
- if (err >= 0) {
+ if (btrfs_fs_closing(fs_info)) {
+ btrfs_info(fs_info, "qgroup scan paused");
+ } else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
} else {
@@ -2378,12 +2389,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- init_completion(&fs_info->qgroup_rescan_completion);
-
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -2486,3 +2496,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from the related qgroups or do
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for a successful reserve
+ * Return <0 for error (including -EDQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ len == 0)
+ return 0;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (!changeset.range_changed)
+ return -ENOMEM;
+ ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ trace_btrfs_qgroup_reserve_data(inode, start, len,
+ changeset.bytes_changed,
+ QGROUP_RESERVE);
+ if (ret < 0)
+ goto cleanup;
+ ret = qgroup_reserve(root, changeset.bytes_changed);
+ if (ret < 0)
+ goto cleanup;
+
+ ulist_free(changeset.range_changed);
+ return ret;
+
+cleanup:
+ /* cleanup already reserved ranges */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(changeset.range_changed, &uiter)))
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+ unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+ GFP_NOFS);
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+ int free)
+{
+ struct extent_changeset changeset;
+ int trace_op = QGROUP_RELEASE;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (!changeset.range_changed)
+ return -ENOMEM;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ if (ret < 0)
+ goto out;
+
+ if (free) {
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ trace_op = QGROUP_FREE;
+ }
+ trace_btrfs_qgroup_release_data(inode, start, len,
+ changeset.bytes_changed, trace_op);
+out:
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages gets invalidated before reaching
+ * disk, or for error cleanup.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages has been written to disk and the
+ * corresponding FILE_EXTENT item is inserted into the corresponding root.
+ *
+ * Since the new qgroup accounting framework only updates qgroup numbers at
+ * commit_transaction() time, the reserved space shouldn't be freed from the
+ * related qgroups.
+ *
+ * But we should release the range from the io_tree, to allow further writes
+ * to be COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ num_bytes == 0)
+ return 0;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ ret = qgroup_reserve(root, num_bytes);
+ if (ret < 0)
+ return ret;
+ atomic_add(num_bytes, &root->qgroup_meta_rsv);
+ return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+ int reserved;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+ if (reserved == 0)
+ return;
+ qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+ atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+ qgroup_free(root, num_bytes);
+}
+
+/*
+ * Check for leaked qgroup reserved space, normally at inode destruction
+ * time.
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator iter;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (WARN_ON(!changeset.range_changed))
+ return;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+ WARN_ON(ret < 0);
+ if (WARN_ON(changeset.bytes_changed)) {
+ ULIST_ITER_INIT(&iter);
+ while ((unode = ulist_next(changeset.range_changed, &iter))) {
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
+ "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+ inode->i_ino, unode->val, unode->aux);
+ }
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ }
+ ulist_free(changeset.range_changed);
+}
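
Taken together, the three data functions above define a reserve/release/free lifecycle keyed on the EXTENT_QGROUP_RESERVED bit in the inode's io_tree, so a range is charged exactly once no matter how often it is dirtied. A hedged sketch of how a write path would drive them, following the comments above (do_the_write() is a hypothetical placeholder, not a btrfs function):

static int write_one_range(struct inode *inode, u64 start, u64 len)
{
	int ret;

	/* Charge the qgroup and mark [start, start + len) with
	 * EXTENT_QGROUP_RESERVED in the io_tree; reserving an
	 * already-reserved range is a no-op. */
	ret = btrfs_qgroup_reserve_data(inode, start, len);
	if (ret < 0)
		return ret;	/* e.g. -EDQUOT when over quota */

	ret = do_the_write(inode, start, len);	/* hypothetical helper */
	if (ret < 0) {
		/* Nothing reached disk: clear the io_tree bit and
		 * return the bytes to the qgroup. */
		btrfs_qgroup_free_data(inode, start, len);
		return ret;
	}

	/* Data hit disk and the file extent item is in place: clear
	 * the io_tree bit but keep the charge, which the accounting
	 * settles at transaction commit. */
	return btrfs_qgroup_release_data(inode, start, len);
}
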
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcfa354c..ecb2c143ef75 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
};
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE (1<<0)
+#define QGROUP_RELEASE (1<<1)
+#define QGROUP_FREE (1<<2)
+
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes);
+/*
+ * TODO: Add a proper trace point for this; btrfs_qgroup_free() is called
+ * from many places, so we can't provide a good trace for the delayed ref case.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
+{
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+ trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
#endif /* __BTRFS_QGROUP__ */
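
The metadata half of the new API keeps a per-root running total in qgroup_meta_rsv so that btrfs_qgroup_free_meta_all() can return everything outstanding in one step. The counter discipline in isolation (stand-in names; give_back_to_qgroup() is hypothetical):

static atomic_t meta_rsv;	/* stand-in for root->qgroup_meta_rsv */

static int free_meta_all(void)
{
	/* atomic_xchg() reads and zeroes in one step, so two racing
	 * callers can never hand the same bytes back twice. */
	int reserved = atomic_xchg(&meta_rsv, 0);

	if (reserved)
		give_back_to_qgroup(reserved);	/* hypothetical */
	return reserved;
}
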
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fcf7265ca46f..6d707545f775 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
}
spin_unlock_irqrestore(&table->cache_lock, flags);
- return;
}
/*
@@ -810,7 +809,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
goto done_nolock;
- } else if (waitqueue_active(&h->wait)) {
+ /*
+ * The barrier for this waitqueue_active is not needed;
+ * we're protected by h->lock and can't miss a wakeup.
+ */
+ } else if (waitqueue_active(&h->wait)) {
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
wake_up(&h->wait);
@@ -902,7 +905,6 @@ static void raid_write_end_io(struct bio *bio)
err = -EIO;
rbio_orig_end_io(rbio, err);
- return;
}
/*
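
The hunk above documents the opposite case from the locking.c changes: no barrier is needed when the waker and the sleeper serialize on the same lock, because the sleeper re-checks the condition under that lock before sleeping. A sketch of that lock-protected shape, again with stand-in names:

static DEFINE_SPINLOCK(state_lock);
static DECLARE_WAIT_QUEUE_HEAD(state_wq);
static int state_done;

static void waker_locked(void)
{
	spin_lock(&state_lock);
	state_done = 1;
	/*
	 * No smp_mb() needed: the sleeper tests state_done under
	 * state_lock before sleeping, so the lock ordering alone
	 * prevents a missed wakeup.
	 */
	if (waitqueue_active(&state_wq))
		wake_up(&state_wq);
	spin_unlock(&state_lock);
}
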
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 4645cd16d5ba..619f92963e27 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
rec = kzalloc(sizeof(*rec), GFP_NOFS);
if (!rec) {
reada_extent_put(root->fs_info, re);
- return -1;
+ return -ENOMEM;
}
rec->rc = rc;
@@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
u64 start;
u64 generation;
int level;
+ int ret;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
@@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- if (reada_add_block(rc, start, &max_key, level, generation)) {
+ ret = reada_add_block(rc, start, &max_key, level, generation);
+ if (ret) {
kfree(rc);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
reada_start_machine(root->fs_info);
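
The reada.c fix replaces a bare -1 with -ENOMEM and then forwards whatever reada_add_block() returns through ERR_PTR(), rather than assuming the only possible failure is allocation. The idiom in general form (struct thing and setup_thing() are illustrative stand-ins):

#include <linux/err.h>
#include <linux/slab.h>

struct thing { int x; };

static int setup_thing(struct thing *t)
{
	return 0;	/* or -errno on failure */
}

static struct thing *make_thing(void)
{
	struct thing *t = kzalloc(sizeof(*t), GFP_NOFS);
	int ret;

	if (!t)
		return ERR_PTR(-ENOMEM);

	ret = setup_thing(t);
	if (ret) {
		kfree(t);
		return ERR_PTR(ret);	/* forward the real errno */
	}
	return t;
}

/* Callers then test with IS_ERR() and unwrap with PTR_ERR(). */
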
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 303babeef505..ef6d8fc85853 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
- path1->reada = 1;
- path2->reada = 2;
+ path1->reada = READA_FORWARD;
+ path2->reada = READA_FORWARD;
node = alloc_backref_node(cache);
if (!node) {
@@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
- key.objectid, key.offset, 1);
+ key.objectid, key.offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
break;
@@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
- key.objectid, key.offset, 1);
+ key.objectid, key.offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
break;
@@ -1900,23 +1900,21 @@ again:
ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0,
- 1);
+ src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0, 1);
+ 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0,
- 1);
+ src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0, 1);
+ 0);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -2132,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -2418,7 +2416,7 @@ again:
}
out:
if (ret) {
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
- node->level, 0, 1);
+ node->level, 0);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
BUG_ON(cluster->start != cluster->boundary[0]);
mutex_lock(&inode->i_mutex);
- ret = btrfs_check_data_free_space(inode, cluster->end +
- 1 - cluster->start, 0);
+ ret = btrfs_check_data_free_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
if (ret)
goto out;
@@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
break;
nr++;
}
- btrfs_free_reserved_data_space(inode, cluster->end +
- 1 - cluster->start);
+ btrfs_free_reserved_data_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
@@ -3529,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
@@ -3919,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
ret = prepare_to_relocate(rc);
if (ret) {
@@ -4345,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
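
The path->reada changes across this file are mechanical: the old magic values (1 and 2 for forward readahead, -1 for backward) become named enum values. A hedged reconstruction of the enum these hunks rely on; the authoritative definition lives in ctree.h elsewhere in this series:

enum {
	READA_NONE = 0,
	READA_BACK,
	READA_FORWARD,
};

/* Usage, matching the hunks above:
 *	path->reada = READA_FORWARD;	// was: path->reada = 1 or 2
 *	path->reada = READA_BACK;	// was: path->reada = -1
 */
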
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728a639f..7cf8509deda7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
- printk(KERN_WARNING "BTRFS: mismatching "
+ btrfs_warn(eb->fs_info,
+ "mismatching "
"generation and generation_v2 "
"found in root item. This root "
"was probably mounted with an "
"older kernel. Resetting all "
- "new fields.\n");
+ "new fields.");
}
need_reset = 1;
}
@@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
int ret;
int slot;
unsigned long ptr;
- int old_len;
+ u32 old_len;
path = btrfs_alloc_path();
if (!path)
@@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to start trans to delete "
"orphan item");
break;
@@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid);
btrfs_end_transaction(trans, tree_root);
if (err) {
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to delete root orphan "
"item");
break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a39f5d1144e8..0c981ebe2acb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -248,14 +248,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size);
+ struct scrub_block *sblock,
+ int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
@@ -580,9 +575,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
- "length %llu, links %u (path: %s)\n", swarn->errstr,
+ "length %llu, links %u (path: %s)", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -592,9 +587,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
return 0;
err:
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
- "resolving failed with ret=%d\n", swarn->errstr,
+ "resolving failed with ret=%d", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset, ret);
@@ -649,10 +644,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
- printk_in_rcu(KERN_WARNING
- "BTRFS: %s at logical %llu on dev %s, "
+ btrfs_warn_in_rcu(fs_info,
+ "%s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
- "%llu\n", errstr, swarn.logical,
+ "%llu", errstr, swarn.logical,
rcu_str_deref(dev->name),
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
@@ -850,8 +845,8 @@ out:
btrfs_dev_replace_stats_inc(
&sctx->dev_root->fs_info->dev_replace.
num_uncorrectable_read_errors);
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "unable to fixup (nodatasum) error at logical %llu on dev %s",
fixup->logical, rcu_str_deref(fixup->dev->name));
}
@@ -889,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
- u64 generation;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- u8 *csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
@@ -918,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
- generation = sblock_to_check->pagev[0]->generation;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
- csum = sblock_to_check->pagev[0]->csum;
dev = sblock_to_check->pagev[0]->dev;
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
@@ -987,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -1101,9 +1091,7 @@ nodatasum_case:
sblock_other = sblocks_for_recheck + mirror_index;
/* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size, 0);
+ scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1215,9 +1203,7 @@ nodatasum_case:
* is verified, but most likely the data comes out
* of the page cache.
*/
- scrub_recheck_block(fs_info, sblock_bad,
- is_metadata, have_csum, csum,
- generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1230,8 +1216,8 @@ corrected_error:
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: fixed up error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
} else {
@@ -1239,8 +1225,8 @@ did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
@@ -1318,6 +1304,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
+ u64 generation = original_sblock->pagev[0]->generation;
+ u64 flags = original_sblock->pagev[0]->flags;
+ u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
@@ -1372,6 +1361,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
+
page = kzalloc(sizeof(*page), GFP_NOFS);
if (!page) {
leave_nomem:
@@ -1383,7 +1373,15 @@ leave_nomem:
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
+ page->sblock = sblock;
+ page->flags = flags;
+ page->generation = generation;
page->logical = logical;
+ page->have_csum = have_csum;
+ if (have_csum)
+ memcpy(page->csum,
+ original_sblock->pagev[0]->csum,
+ sctx->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1474,15 +1472,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
* the pages that are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror)
+ struct scrub_block *sblock,
+ int retry_failed_mirror)
{
int page_num;
sblock->no_io_error_seen = 1;
- sblock->header_error = 0;
- sblock->checksum_error = 0;
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
@@ -1518,11 +1513,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- csum_size);
-
- return;
+ scrub_recheck_block_checksum(sblock);
}
static inline int scrub_check_fsid(u8 fsid[],
@@ -1535,61 +1526,16 @@ static inline int scrub_check_fsid(u8 fsid[],
return !ret;
}
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size)
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
- int page_num;
- u8 calculated_csum[BTRFS_CSUM_SIZE];
- u32 crc = ~(u32)0;
- void *mapped_buffer;
-
- WARN_ON(!sblock->pagev[0]->page);
- if (is_metadata) {
- struct btrfs_header *h;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- h = (struct btrfs_header *)mapped_buffer;
-
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
- memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE)) {
- sblock->header_error = 1;
- } else if (generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
- }
- csum = h->csum;
- } else {
- if (!have_csum)
- return;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- }
-
- for (page_num = 0;;) {
- if (page_num == 0 && is_metadata)
- crc = btrfs_csum_data(
- ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
- crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
- else
- crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
-
- kunmap_atomic(mapped_buffer);
- page_num++;
- if (page_num >= sblock->page_count)
- break;
- WARN_ON(!sblock->pagev[page_num]->page);
-
- mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
- }
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+ sblock->generation_error = 0;
- btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, csum, csum_size))
- sblock->checksum_error = 1;
+ if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ scrub_checksum_data(sblock);
+ else
+ scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
@@ -1626,9 +1572,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
int ret;
if (!page_bad->dev->bdev) {
- printk_ratelimited(KERN_WARNING "BTRFS: "
+ btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) "
- "is unexpected!\n");
+ "is unexpected");
return -EIO;
}
@@ -1833,6 +1779,18 @@ static int scrub_checksum(struct scrub_block *sblock)
u64 flags;
int ret;
+ /*
+ * No need to initialize these stats currently, because this
+ * function only uses the return value instead of the stats
+ * values.
+ *
+ * TODO: always use the stats.
+ */
+ sblock->header_error = 0;
+ sblock->generation_error = 0;
+ sblock->checksum_error = 0;
+
WARN_ON(sblock->page_count < 1);
flags = sblock->pagev[0]->flags;
ret = 0;
@@ -1858,7 +1816,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
- int fail = 0;
u64 len;
int index;
@@ -1889,9 +1846,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
btrfs_csum_final(crc, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
- fail = 1;
+ sblock->checksum_error = 1;
- return fail;
+ return sblock->checksum_error;
}
static int scrub_checksum_tree_block(struct scrub_block *sblock)
@@ -1907,8 +1864,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
u64 len;
int index;
@@ -1923,19 +1878,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
-
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
- ++fail;
+ sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
- ++fail;
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
- ++fail;
+ sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
- ++fail;
+ sblock->header_error = 1;
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1960,9 +1916,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
- ++crc_fail;
+ sblock->checksum_error = 1;
- return fail || crc_fail;
+ return sblock->header_error || sblock->checksum_error;
}
static int scrub_checksum_super(struct scrub_block *sblock)
@@ -2176,40 +2132,28 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
- unsigned int is_metadata;
- unsigned int have_csum;
- u8 *csum;
- u64 generation;
u64 logical;
struct btrfs_device *dev;
- is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock->pagev[0]->have_csum;
- csum = sblock->pagev[0]->csum;
- generation = sblock->pagev[0]->generation;
logical = sblock->pagev[0]->logical;
dev = sblock->pagev[0]->dev;
- if (sblock->no_io_error_seen) {
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size);
- }
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
if (!sblock->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: I/O error rebulding logical %llu for dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else {
scrub_write_block_to_dev_replace(sblock);
@@ -2500,8 +2444,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
}
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
- u8 *csum)
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
@@ -2565,7 +2508,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
if (sctx->is_dev_replace && !have_csum) {
@@ -2703,7 +2646,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
goto skip;
}
@@ -3012,6 +2955,9 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
logic_start + map->stripe_len)) {
btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
key.objectid, logic_start);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
again:
@@ -3361,6 +3307,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
key.objectid, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
@@ -3481,7 +3430,9 @@ out:
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
- u64 dev_offset, int is_dev_replace)
+ u64 dev_offset,
+ struct btrfs_block_group_cache *cache,
+ int is_dev_replace)
{
struct btrfs_mapping_tree *map_tree =
&sctx->dev_root->fs_info->mapping_tree;
@@ -3494,8 +3445,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
- if (!em)
- return -EINVAL;
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed)
+ ret = -EINVAL;
+ spin_unlock(&cache->lock);
+
+ return ret;
+ }
map = (struct map_lookup *)em->bdev;
if (em->start != chunk_offset)
@@ -3532,6 +3493,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
u64 length;
u64 chunk_offset;
int ret = 0;
+ int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -3543,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3617,7 +3579,21 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(root, cache);
scrub_pause_off(fs_info);
- if (ret) {
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC) {
+ /*
+ * btrfs_inc_block_group_ro returns -ENOSPC when it
+ * fails to create a new chunk for metadata.
+ * This is not a problem for scrub/replace, because
+ * metadata is always COWed, and our scrub pauses
+ * transaction commits.
+ */
+ ro_set = 0;
+ } else {
+ btrfs_warn(fs_info, "failed setting block group ro, ret=%d",
+ ret);
btrfs_put_block_group(cache);
break;
}
@@ -3626,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, is_dev_replace);
+ found_key.offset, cache, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3660,7 +3636,30 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dec_block_group_ro(root, cache);
+ if (ro_set)
+ btrfs_dec_block_group_ro(root, cache);
+
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed && !cache->ro && cache->reserved == 0 &&
+ btrfs_block_group_used(&cache->item) == 0) {
+ spin_unlock(&cache->lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ } else {
+ spin_unlock(&cache->lock);
+ }
btrfs_put_block_group(cache);
if (ret)
@@ -3734,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
1, 4);
else
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ btrfs_alloc_workqueue("scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
fs_info->scrub_nocow_workers =
- btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
if (!fs_info->scrub_nocow_workers)
goto fail_scrub_nocow_workers;
fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ btrfs_alloc_workqueue("scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
@@ -4210,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
io_tree = &BTRFS_I(inode)->io_tree;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
@@ -4375,8 +4374,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
if (!dev)
return -EIO;
if (!dev->bdev) {
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ btrfs_warn_rl(dev->dev_root->fs_info,
+ "scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
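
The recheck refactor in this file removes duplicated verification logic: instead of scrub_recheck_block_checksum() re-implementing the header and csum checks with its own long parameter list, it now just resets the sblock verdict fields and dispatches to the existing helpers, which record their results in header_error/generation_error/checksum_error. Condensed to its control flow (a restatement of the hunks above, not new behavior):

static void recheck_block_checksum(struct scrub_block *sblock)
{
	/* Reset the verdict fields; the helpers below only set them. */
	sblock->header_error = 0;
	sblock->checksum_error = 0;
	sblock->generation_error = 0;

	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
		scrub_checksum_data(sblock);		/* csum only */
	else
		scrub_checksum_tree_block(sblock);	/* header + csum */
}
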
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a739b825bdd3..63a6152be04b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
- if (compressed != BTRFS_COMPRESS_NONE) {
- /*
- * Offsets given by iterate_extent_inodes() are relative
- * to the start of the extent, we need to add logical
- * offset from the file extent item.
- * (See why at backref.c:check_extent_in_eb())
- */
- cur_clone_root->offset += btrfs_file_extent_offset(eb,
- fi);
- }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1479,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret);
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (the transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between, or the subvol was snapshotted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
@@ -2353,8 +2357,14 @@ static int send_subvol_begin(struct send_ctx *sctx)
}
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
- sctx->send_root->root_item.uuid);
+
+ if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
@@ -2564,7 +2574,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
} else if (S_ISSOCK(mode)) {
cmd = BTRFS_SEND_C_MKSOCK;
} else {
- printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
ret = -ENOTSUPP;
goto out;
@@ -4687,6 +4697,171 @@ tlv_put_failure:
return ret;
}
+static int send_extent_data(struct send_ctx *sctx,
+ const u64 offset,
+ const u64 len)
+{
+ u64 sent = 0;
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, len);
+
+ while (sent < len) {
+ u64 size = len - sent;
+ int ret;
+
+ if (size > BTRFS_SEND_READ_SIZE)
+ size = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, offset + sent, size);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ break;
+ sent += ret;
+ }
+ return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+ struct clone_root *clone_root,
+ const u64 disk_byte,
+ u64 data_offset,
+ u64 offset,
+ u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * We can't send a clone operation for the entire range if we find
+ * extent items in the respective range in the source file that
+ * refer to different extents or if we find holes.
+ * So check for that and do a mix of clone and regular write/copy
+ * operations if needed.
+ *
+ * Example:
+ *
+ * mkfs.btrfs -f /dev/sda
+ * mount /dev/sda /mnt
+ * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+ * cp --reflink=always /mnt/foo /mnt/bar
+ * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+ * btrfs subvolume snapshot -r /mnt /mnt/snap
+ *
+ * If, while sending the snapshot and processing file bar (which
+ * has a higher inode number than foo), we blindly send a clone operation
+ * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+ * a file bar that matches the content of file foo - iow, it doesn't match
+ * the content of bar in the original filesystem.
+ */
+ key.objectid = clone_root->ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_root->offset;
+ ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == clone_root->ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u64 ext_len;
+ u64 clone_len;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(clone_root->root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /*
+ * We might have an implicit trailing hole (NO_HOLES feature
+ * enabled). We deal with it after leaving this loop.
+ */
+ if (key.objectid != clone_root->ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+ ext_len = PAGE_CACHE_ALIGN(ext_len);
+ } else {
+ ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+ }
+
+ if (key.offset + ext_len <= clone_root->offset)
+ goto next;
+
+ if (key.offset > clone_root->offset) {
+ /* Implicit hole, NO_HOLES feature enabled. */
+ u64 hole_len = key.offset - clone_root->offset;
+
+ if (hole_len > len)
+ hole_len = len;
+ ret = send_extent_data(sctx, offset, hole_len);
+ if (ret < 0)
+ goto out;
+
+ len -= hole_len;
+ if (len == 0)
+ break;
+ offset += hole_len;
+ clone_root->offset += hole_len;
+ data_offset += hole_len;
+ }
+
+ if (key.offset >= clone_root->offset + len)
+ break;
+
+ clone_len = min_t(u64, ext_len, len);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+ btrfs_file_extent_offset(leaf, ei) == data_offset)
+ ret = send_clone(sctx, offset, clone_len, clone_root);
+ else
+ ret = send_extent_data(sctx, offset, clone_len);
+
+ if (ret < 0)
+ goto out;
+
+ len -= clone_len;
+ if (len == 0)
+ break;
+ offset += clone_len;
+ clone_root->offset += clone_len;
+ data_offset += clone_len;
+next:
+ path->slots[0]++;
+ }
+
+ if (len > 0)
+ ret = send_extent_data(sctx, offset, len);
+ else
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -4695,9 +4870,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
int ret = 0;
struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 pos = 0;
u64 len;
- u32 l;
u8 type;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
@@ -4725,22 +4898,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
}
if (clone_root && IS_ALIGNED(offset + len, bs)) {
- ret = send_clone(sctx, offset, len, clone_root);
- } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
- ret = send_update_extent(sctx, offset, len);
+ u64 disk_byte;
+ u64 data_offset;
+
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+ offset, len);
} else {
- while (pos < len) {
- l = len - pos;
- if (l > BTRFS_SEND_READ_SIZE)
- l = BTRFS_SEND_READ_SIZE;
- ret = send_write(sctx, pos + offset, l);
- if (ret < 0)
- goto out;
- if (!ret)
- break;
- pos += ret;
- }
- ret = 0;
+ ret = send_extent_data(sctx, offset, len);
}
out:
return ret;
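
clone_range() implements the mixed strategy the long comment describes: walk the source extent items covering the range to send and, per piece, decide between a clone op and a plain copy. The three-way decision at the heart of the loop, paraphrased from the hunks above (not a drop-in):

/* For each source extent item overlapping the range to send: */
if (key.offset > clone_root->offset) {
	/* Implicit hole (NO_HOLES): send plain data for the gap. */
	ret = send_extent_data(sctx, offset, hole_len);
} else if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
	   btrfs_file_extent_offset(leaf, ei) == data_offset) {
	/* Byte-identical extent on disk: a clone op is safe. */
	ret = send_clone(sctx, offset, clone_len, clone_root);
} else {
	/* Different extent underneath: fall back to a copy. */
	ret = send_extent_data(sctx, offset, clone_len);
}
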
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 48d425aef05b..02e00166c4da 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -22,8 +22,8 @@
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
#define BTRFS_SEND_STREAM_VERSION 1
-#define BTRFS_SEND_BUF_SIZE (1024 * 64)
-#define BTRFS_SEND_READ_SIZE (1024 * 48)
+#define BTRFS_SEND_BUF_SIZE SZ_64K
+#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 11d1eab9234d..9b9eab6d048e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
}
}
-#ifdef CONFIG_PRINTK
/*
* __btrfs_std_error decodes expected errors from the caller and
 * invokes the appropriate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
const char *errstr;
+#endif
/*
* Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
+#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
+#endif
/* Don't go through full error handling during mount */
save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
btrfs_handle_error(fs_info);
}
+#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_end(args);
}
-
-#else
-
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- struct super_block *sb = fs_info->sb;
-
- /*
- * Special case: if the error is EROFS, and we're already
- * under MS_RDONLY, then it is safe here.
- */
- if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
-
- /* Don't go through full error handling during mount */
- if (sb->s_flags & MS_BORN) {
- save_error_info(fs_info);
- btrfs_handle_error(fs_info);
- }
-}
#endif
/*
@@ -312,18 +295,22 @@ enum {
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
- Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
- Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+ Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+ Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+ Opt_skip_balance, Opt_check_integrity,
+ Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache,
+#ifdef CONFIG_BTRFS_DEBUG
+ Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
{Opt_subvolid, "subvolid=%s"},
@@ -354,6 +341,7 @@ static match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
+ {Opt_space_cache_version, "space_cache=%s"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
@@ -372,6 +360,11 @@ static match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_DEBUG
+ {Opt_fragment_data, "fragment=data"},
+ {Opt_fragment_metadata, "fragment=metadata"},
+ {Opt_fragment_all, "fragment=all"},
+#endif
{Opt_err, NULL},
};
@@ -392,7 +385,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
bool compress_force = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (cache_gen)
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+ else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
if (!options)
@@ -626,15 +621,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"turning off discard");
break;
case Opt_space_cache:
- btrfs_set_and_info(root, SPACE_CACHE,
- "enabling disk space caching");
+ case Opt_space_cache_version:
+ if (token == Opt_space_cache ||
+ strcmp(args[0].from, "v1") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ FREE_SPACE_TREE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ } else if (strcmp(args[0].from, "v2") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ SPACE_CACHE);
+ btrfs_set_and_info(root, FREE_SPACE_TREE,
+ "enabling free space tree");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- btrfs_clear_and_info(root, SPACE_CACHE,
- "disabling disk space caching");
+ if (btrfs_test_opt(root, SPACE_CACHE)) {
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ }
+ if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ "disabling free space tree");
+ }
break;
case Opt_inode_cache:
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
@@ -738,6 +753,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+#ifdef CONFIG_BTRFS_DEBUG
+ case Opt_fragment_all:
+ btrfs_info(root->fs_info, "fragmenting all space");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_metadata:
+ btrfs_info(root->fs_info, "fragmenting metadata");
+ btrfs_set_opt(info->mount_opt,
+ FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_data:
+ btrfs_info(root->fs_info, "fragmenting data");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ break;
+#endif
case Opt_err:
btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
@@ -747,8 +778,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, CLEAR_CACHE)) {
+ btrfs_err(root->fs_info, "cannot disable free space tree");
+ ret = -EINVAL;
+ }
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
+ if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
}
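[Note: with the new Opt_space_cache_version token, "space_cache" and "space_cache=v1" select the v1 disk cache while "space_cache=v2" selects the free space tree (e.g. mount -o space_cache=v2 <dev> <dir>). For readers unfamiliar with the parser, a hedged sketch of how a "name=%s" token yields its value via match_token(), where p is one comma-separated option string:]

    substring_t args[MAX_OPT_ARGS];
    int token = match_token(p, tokens, args);

    if (token == Opt_space_cache_version) {
            /* args[0].from points at the text after '=', e.g. "v1" or "v2" */
            if (strcmp(args[0].from, "v2") == 0)
                    ; /* ... enable the free space tree ... */
    }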
@@ -1155,6 +1195,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, RESCAN_UUID_TREE))
@@ -1189,6 +1231,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_test_opt(root, FRAGMENT_DATA))
+ seq_puts(seq, ",fragment=data");
+ if (btrfs_test_opt(root, FRAGMENT_METADATA))
+ seq_puts(seq, ",fragment=metadata");
+#endif
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
seq_puts(seq, ",subvol=");
@@ -1501,9 +1549,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if ((flags ^ s->s_flags) & MS_RDONLY)
error = -EBUSY;
} else {
- char b[BDEVNAME_SIZE];
-
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
@@ -1852,7 +1898,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
- skip_space = 1024 * 1024;
+ skip_space = SZ_1M;
/* user can set the offset in fs_info->alloc_start. */
if (fs_info->alloc_start &&
@@ -1943,6 +1989,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* there are other factors that may change the result (like a new metadata
* chunk).
*
+ * If metadata is exhausted, f_bavail will be 0.
+ *
* FIXME: not accurate for mixed block groups, total and free/used are ok,
* available appears slightly larger.
*/
@@ -1954,11 +2002,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
+ u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
+ u64 thresh = 0;
/*
* holding chunk_mutex to avoid allocating new chunks, holding
@@ -1984,6 +2034,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ total_free_meta += found->disk_total - found->disk_used;
total_used += found->disk_used;
}
@@ -2006,6 +2058,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
+ /*
+ * We calculate the remaining metadata space minus the global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this does
+ * not hold in practice: the exhausted state happens while there's still
+ * some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various operations,
+ * so some operations that consume a small amount of metadata will
+ * succeed even if Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = 4 * 1024 * 1024;
+
+ if (total_free_meta - thresh < block_rsv->size)
+ buf->f_bavail = 0;
+
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
@@ -2212,6 +2282,9 @@ static int btrfs_run_sanity_tests(void)
if (ret)
goto out;
ret = btrfs_test_qgroups();
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree();
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 603b0cc2b9bb..e0ac85949067 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
NULL,
};
-static void btrfs_release_super_kobj(struct kobject *kobj)
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
{
struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
- memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = btrfs_release_super_kobj,
+ .release = btrfs_release_fsid_kobj,
};
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+ return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
}
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
}
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
fs_devs->device_dir_kobj = NULL;
}
- if (fs_devs->super_kobj.state_initialized) {
- kobject_del(&fs_devs->super_kobj);
- kobject_put(&fs_devs->super_kobj);
+ if (fs_devs->fsid_kobj.state_initialized) {
+ kobject_del(&fs_devs->fsid_kobj);
+ kobject_put(&fs_devs->fsid_kobj);
wait_for_completion(&fs_devs->kobj_unregister);
}
}
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
btrfs_reset_fs_info_ptr(fs_info);
@@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
kobject_put(fs_info->space_info_kobj);
}
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
- btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
+ sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
/* when one_device is NULL, it removes all device links */
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
if (!fs_devs->device_dir_kobj)
fs_devs->device_dir_kobj = kobject_create_and_add("devices",
- &fs_devs->super_kobj);
+ &fs_devs->fsid_kobj);
if (!fs_devs->device_dir_kobj)
return -ENOMEM;
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
int error = 0;
@@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
int error;
init_completion(&fs_devs->kobj_unregister);
- fs_devs->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->super_kobj,
+ fs_devs->fsid_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
return error;
}
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
int error;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
- struct kobject *super_kobj = &fs_devs->super_kobj;
+ struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
btrfs_set_fs_info_ptr(fs_info);
- error = btrfs_kobj_add_device(fs_devs, NULL);
+ error = btrfs_sysfs_add_device_link(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_files(super_kobj, btrfs_attrs);
+ error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_kobj_rm_device(fs_devs, NULL);
+ btrfs_sysfs_rm_device_link(fs_devs, NULL);
return error;
}
- error = sysfs_create_group(super_kobj,
+ error = sysfs_create_group(fsid_kobj,
&btrfs_feature_attr_group);
if (error)
goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- super_kobj);
+ fsid_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
return 0;
failure:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
return error;
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6392527bcc15..9c09522125a6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9626252ee6b4..b1d920b30070 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../free-space-cache.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../qgroup.h"
@@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+ extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
return fs_info;
}
@@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
kfree(root);
}
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+ cache->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!cache->fs_info) {
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = 0;
+ cache->key.offset = length;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
+
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ btrfs_init_free_space_ctl(cache);
+ mutex_init(&cache->free_space_lock);
+
+ return cache;
+}
+
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (!cache)
+ return;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+}
+
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
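[Note: the helpers above give self-tests a block group with a working free space ctl but no disk backing; a usage sketch, with error handling abridged, mirroring how btrfs_test_free_space_cache() uses them later in this patch:]

    struct btrfs_block_group_cache *cache;

    cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); /* 1GiB group */
    if (!cache)
            return -ENOMEM;
    /* ... exercise btrfs_add_free_space()/btrfs_remove_free_space() ... */
    btrfs_free_dummy_block_group(cache);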
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index fd3954224480..054b8c73c951 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,17 +24,23 @@
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
struct btrfs_root;
+struct btrfs_trans_handle;
int btrfs_test_free_space_cache(void);
int btrfs_test_extent_buffer_operations(void);
int btrfs_test_extent_io(void);
int btrfs_test_inodes(void);
int btrfs_test_qgroups(void);
+int btrfs_test_free_space_tree(void);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
void btrfs_free_dummy_root(struct btrfs_root *root);
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length);
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
static inline int btrfs_test_free_space_cache(void)
{
@@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void)
{
return 0;
}
+static inline int btrfs_test_free_space_tree(void)
+{
+ return 0;
+}
#endif
#endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e9f2368177d..e29fa297e053 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
#include "btrfs-tests.h"
#include "../extent_io.h"
@@ -70,12 +72,14 @@ static int test_find_delalloc(void)
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
- u64 total_dirty = 256 * 1024 * 1024;
- u64 max_bytes = 128 * 1024 * 1024;
+ u64 total_dirty = SZ_256M;
+ u64 max_bytes = SZ_128M;
u64 start, end, test_start;
u64 found;
int ret = -EINVAL;
+ test_msg("Running find delalloc tests\n");
+
inode = btrfs_new_test_inode();
if (!inode) {
test_msg("Failed to allocate test inode\n");
@@ -133,7 +137,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- test_start = 64 * 1024 * 1024;
+ test_start = SZ_64M;
locked_page = find_lock_page(inode->i_mapping,
test_start >> PAGE_CACHE_SHIFT);
if (!locked_page) {
@@ -220,8 +224,8 @@ static int test_find_delalloc(void)
* Now to test where we run into a page that is no longer dirty in the
* range we want to find.
*/
- page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
- >> PAGE_CACHE_SHIFT);
+ page = find_get_page(inode->i_mapping,
+ (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT);
if (!page) {
test_msg("Couldn't find our page\n");
goto out_bits;
@@ -268,8 +272,139 @@ out:
return ret;
}
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
+ unsigned long len)
+{
+ unsigned long i, x;
+
+ memset(bitmap, 0, len);
+ memset_extent_buffer(eb, 0, 0, len);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Bitmap was not zeroed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ bitmap_clear(bitmap,
+ (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a wonky pseudo-random bit pattern for the sake of not using
+ * something repetitive that could miss some hypothetical off-by-n bug.
+ */
+ x = 0;
+ for (i = 0; i < len / sizeof(long); i++) {
+ x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL;
+ bitmap[i] = x;
+ }
+ write_extent_buffer(eb, bitmap, 0, len);
+
+ for (i = 0; i < len * BITS_PER_BYTE; i++) {
+ int bit, bit1;
+
+ bit = !!test_bit(i, bitmap);
+ bit1 = !!extent_buffer_test_bit(eb, 0, i);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern failed\n");
+ return -EINVAL;
+ }
+
+ bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern with offset failed\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
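[Note: the 0x19660d/0x3c6ef35f pair above is a classic 32-bit linear congruential generator (the constants a = 1664525, c = 1013904223 appear in Numerical Recipes); as a standalone sketch, each step is:]

    /* x' = (a * x + c) mod 2^32, with a = 0x19660d, c = 0x3c6ef35f */
    static unsigned long lcg_next(unsigned long x)
    {
            return (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL;
    }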
+static int test_eb_bitmaps(void)
+{
+ unsigned long len = PAGE_CACHE_SIZE * 4;
+ unsigned long *bitmap;
+ struct extent_buffer *eb;
+ int ret;
+
+ test_msg("Running extent buffer bitmap tests\n");
+
+ bitmap = kmalloc(len, GFP_NOFS);
+ if (!bitmap) {
+ test_msg("Couldn't allocate test bitmap\n");
+ return -ENOMEM;
+ }
+
+ eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+ if (ret)
+ goto out;
+
+ /* Do it over again with an extent buffer which isn't page-aligned. */
+ free_extent_buffer(eb);
+ eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+out:
+ free_extent_buffer(eb);
+ kfree(bitmap);
+ return ret;
+}
+
int btrfs_test_extent_io(void)
{
- test_msg("Running find delalloc tests\n");
- return test_find_delalloc();
+ int ret;
+
+ test_msg("Running extent I/O tests\n");
+
+ ret = test_find_delalloc();
+ if (ret)
+ goto out;
+
+ ret = test_eb_bitmaps();
+out:
+ test_msg("Extent I/O tests finished\n");
+ return ret;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 2299bfde39ee..c9ad97b1e690 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -19,38 +19,10 @@
#include <linux/slab.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../disk-io.h"
#include "../free-space-cache.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-static struct btrfs_block_group_cache *init_test_block_group(void)
-{
- struct btrfs_block_group_cache *cache;
-
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
- if (!cache)
- return NULL;
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- return NULL;
- }
-
- cache->key.objectid = 0;
- cache->key.offset = 1024 * 1024 * 1024;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = 4096;
- cache->full_stripe_len = 4096;
-
- spin_lock_init(&cache->lock);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->bg_list);
-
- btrfs_init_free_space_ctl(cache);
-
- return cache;
-}
/*
* This test just does basic sanity checking, making sure we can add an extent
@@ -64,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache)
test_msg("Running extent only tests\n");
/* First just make sure we can remove an entire entry */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding initial extents %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing extent %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Full remove left some lingering space\n");
return -1;
}
/* Ok edge and middle cases now */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding half extent %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
if (ret) {
test_msg("Error removing tail end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Error removing front end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
if (ret) {
test_msg("Error removing middle piece %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Still have space at the front\n");
return -1;
}
- if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+ if (test_check_exists(cache, SZ_2M, 4096)) {
test_msg("Still have space in the middle\n");
return -1;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
test_msg("Still have space at the end\n");
return -1;
}
@@ -134,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
test_msg("Running bitmap only tests\n");
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't create a bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing bitmap full range %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Left some space in bitmap\n");
return -1;
}
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to our bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove middle chunk %d\n", ret);
return ret;
@@ -170,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
/* Test a bit straddling two bitmaps */
- ret = test_add_free_space_entry(cache, next_bitmap_offset -
- (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
+ SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space that straddles two bitmaps %d\n",
ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, next_bitmap_offset -
- (1 * 1024 * 1024), 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
- 2 * 1024 * 1024)) {
+ if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
test_msg("Left some space when removing overlapping\n");
return -1;
}
@@ -209,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* bitmap, but the free space completely in the extent and then
* completely in the bitmap.
*/
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
if (ret) {
test_msg("Couldn't create bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Couldn't remove extent entry %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Left remnants after our remove\n");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't re-add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
if (ret) {
test_msg("Couldn't remove from bitmap %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_4M, SZ_1M)) {
test_msg("Left remnants in the bitmap\n");
return -1;
}
@@ -254,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* Ok so a little more evil, extent entry and bitmap at the same offset,
* removing an overlapping chunk.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to a bitmap %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
test_msg("Left over pieces after removing overlapping\n");
return -1;
}
@@ -274,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
__btrfs_remove_free_space_cache(cache->free_space_ctl);
/* Now with the extent entry offset into the bitmap */
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space to the bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
if (ret) {
test_msg("Couldn't add extent to the cache %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
if (ret) {
test_msg("Problem removing overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
test_msg("Left something behind when removing space");
return -1;
}
@@ -308,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* [ del ]
*/
__btrfs_remove_free_space_cache(cache->free_space_ctl);
- ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
- 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
- 5 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
+ 5 * SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
if (ret) {
test_msg("Failed to free our space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024)) {
+ if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
test_msg("Left stuff over\n");
return -1;
}
@@ -343,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* to return -EAGAIN back from btrfs_remove_extent, make sure this
* doesn't happen.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
if (ret) {
test_msg("Error removing bitmap and extent overlapping %d\n", ret);
return ret;
@@ -438,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int ret;
u64 offset;
u64 max_extent_size;
-
- bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
- struct btrfs_free_space *);
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
+ .use_bitmap = test_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
test_msg("Running space stealing from bitmap to extent\n");
@@ -462,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that forces use of bitmaps as soon as we have at least 1
* extent entry.
*/
- use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
- cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
/*
* Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
+ SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -495,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 512Kb, 128Mb + 768Kb[
*/
ret = btrfs_remove_free_space(cache,
- 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024);
+ SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -518,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -528,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -538,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
* by the bitmap too, isn't marked as free either.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -549,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* let's make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -574,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
- 4096);
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -594,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb - 256Kb, 128Mb - 128Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -630,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
test_msg("Cache free space is not 1Mb + 4Kb\n");
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ if (offset != (SZ_128M - SZ_256K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -663,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 4096, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ if (offset != (SZ_128M + SZ_16M)) {
test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -684,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
/*
* Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
- ret = test_add_free_space_entry(cache, 0,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -710,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 128b, 128Mb + 256Kb[
* [128Mb - 768Kb, 128Mb - 512Kb[
*/
- ret = btrfs_remove_free_space(cache,
- 0,
- 128 * 1024 * 1024 - 768 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -734,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 0,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -744,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb - 512Kb, 128Mb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -755,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* let's make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -782,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ ret = btrfs_add_free_space(cache, SZ_32M, 8192);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -793,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb + 128Kb, 128Mb + 256Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -827,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
test_msg("Cache free space is not 1Mb + 8Kb\n");
return -EINVAL;
}
- offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ if (offset != (SZ_128M - 768 * SZ_1K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -860,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 8192, 0,
&max_extent_size);
- if (offset != (32 * 1024 * 1024)) {
+ if (offset != SZ_32M) {
test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -870,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
if (ret)
return ret;
- cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ cache->free_space_ctl->op = orig_free_space_ops;
__btrfs_remove_free_space_cache(cache->free_space_ctl);
return 0;
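[Note: swapping the whole ops pointer, rather than writing through op->use_bitmap as the old code did, lets the real btrfs_free_space_op tables stay const; the pattern as a sketch, with ctl standing in for cache->free_space_ctl:]

    const struct btrfs_free_space_op test_ops = {
            .recalc_thresholds = ctl->op->recalc_thresholds, /* keep original */
            .use_bitmap = test_use_bitmap,                   /* override */
    };
    const struct btrfs_free_space_op *orig = ctl->op;

    ctl->op = &test_ops;
    /* ... run the test ... */
    ctl->op = orig;                                          /* restore */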
@@ -879,16 +827,30 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
- int ret;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
test_msg("Running btrfs free space cache tests\n");
- cache = init_test_block_group();
+ cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
if (!cache) {
test_msg("Couldn't run the tests\n");
return 0;
}
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info)
+ goto out;
+
+ root->fs_info->extent_root = root;
+ cache->fs_info = root->fs_info;
+
ret = test_extents(cache);
if (ret)
goto out;
@@ -901,9 +863,8 @@ int btrfs_test_free_space_cache(void)
ret = test_steal_space_from_bitmap_to_extent(cache);
out:
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
- kfree(cache->free_space_ctl);
- kfree(cache);
+ btrfs_free_dummy_block_group(cache);
+ btrfs_free_dummy_root(root);
test_msg("Free space cache tests finished\n");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
new file mode 100644
index 000000000000..d05fe1ab4808
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+
+struct free_space_extent {
+ u64 start, length;
+};
+
+/*
+ * The test cases align their operations to this in order to hit some of the
+ * edge cases in the bitmap code.
+ */
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+
+static int __check_free_space_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path,
+ struct free_space_extent *extents,
+ unsigned int num_extents)
+{
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ u64 extent_start = 0, offset, end;
+ u32 flags, extent_count;
+ unsigned int i;
+ int ret;
+
+ info = search_free_space_info(trans, fs_info, cache, path, 0);
+ if (IS_ERR(info)) {
+ test_msg("Could not find free space info\n");
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+ if (extent_count != num_extents) {
+ test_msg("Extent count is wrong\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (path->slots[0] != 0)
+ goto invalid;
+ end = cache->key.objectid + cache->key.offset;
+ i = 0;
+ while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY)
+ goto invalid;
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(cache, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ if (i >= num_extents)
+ goto invalid;
+ if (i >= num_extents ||
+ extent_start != extents[i].start ||
+ offset - extent_start != extents[i].length)
+ goto invalid;
+ i++;
+ }
+ prev_bit = bit;
+ offset += cache->sectorsize;
+ }
+ }
+ if (prev_bit == 1) {
+ if (i >= num_extents ||
+ extent_start != extents[i].start ||
+ end - extent_start != extents[i].length)
+ goto invalid;
+ i++;
+ }
+ if (i != num_extents)
+ goto invalid;
+ } else {
+ if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 ||
+ path->slots[0] != 0)
+ goto invalid;
+ for (i = 0; i < num_extents; i++) {
+ path->slots[0]++;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY ||
+ key.objectid != extents[i].start ||
+ key.offset != extents[i].length)
+ goto invalid;
+ }
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+invalid:
+ test_msg("Free space tree is invalid\n");
+ ret = -EINVAL;
+ goto out;
+}
+
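[Note: the bitmap branch above recovers extents from raw bits by watching 0->1 and 1->0 transitions; the core idea reduced to a sketch, where test_bit_at() and record_extent() are hypothetical stand-ins:]

    int prev = 0, bit;
    u64 off, extent_start = 0;

    for (off = start; off < end; off += sectorsize) {
            bit = test_bit_at(off);              /* hypothetical accessor */
            if (prev == 0 && bit == 1)
                    extent_start = off;          /* rising edge: extent begins */
            else if (prev == 1 && bit == 0)
                    record_extent(extent_start, off - extent_start);
            prev = bit;
    }
    if (prev == 1)                               /* extent runs to the end */
            record_extent(extent_start, end - extent_start);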
+static int check_free_space_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path,
+ struct free_space_extent *extents,
+ unsigned int num_extents)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ info = search_free_space_info(trans, fs_info, cache, path, 0);
+ if (IS_ERR(info)) {
+ test_msg("Could not find free space info\n");
+ btrfs_release_path(path);
+ return PTR_ERR(info);
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ ret = __check_free_space_extents(trans, fs_info, cache, path, extents,
+ num_extents);
+ if (ret)
+ return ret;
+
+ /* Flip it to the other format and check that for good measure. */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ ret = convert_free_space_to_extents(trans, fs_info, cache, path);
+ if (ret) {
+ test_msg("Could not convert to extents\n");
+ return ret;
+ }
+ } else {
+ ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+ if (ret) {
+ test_msg("Could not convert to bitmaps\n");
+ return ret;
+ }
+ }
+ return __check_free_space_extents(trans, fs_info, cache, path, extents,
+ num_extents);
+}
+
+static int test_empty_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, cache->key.offset},
+ };
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_all(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {};
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_beginning(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid + BITMAP_RANGE,
+ cache->key.offset - BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_end(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, cache->key.offset - BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid +
+ cache->key.offset - BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_middle(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, BITMAP_RANGE},
+ {cache->key.objectid + 2 * BITMAP_RANGE,
+ cache->key.offset - 2 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_left(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, 2 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_right(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 2 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_both(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, 3 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 2 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_none(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, BITMAP_RANGE},
+ {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE},
+ {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 4 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 2 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+typedef int (*test_func_t)(struct btrfs_trans_handle *,
+ struct btrfs_fs_info *,
+ struct btrfs_block_group_cache *,
+ struct btrfs_path *);
+
+static int run_test(test_func_t test_func, int bitmaps)
+{
+ struct btrfs_root *root = NULL;
+ struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_trans_handle trans;
+ struct btrfs_path *path = NULL;
+ int ret;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate dummy root\n");
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
+ root->fs_info->free_space_root = root;
+ root->fs_info->tree_root = root;
+
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 8192;
+
+ cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+ if (!cache) {
+ test_msg("Couldn't allocate dummy block group cache\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ cache->bitmap_low_thresh = 0;
+ cache->bitmap_high_thresh = (u32)-1;
+ cache->needs_free_space = 1;
+
+ btrfs_init_dummy_trans(&trans);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = add_block_group_free_space(&trans, root->fs_info, cache);
+ if (ret) {
+ test_msg("Could not add block group free space\n");
+ goto out;
+ }
+
+ if (bitmaps) {
+ ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
+ cache, path);
+ if (ret) {
+ test_msg("Could not convert block group to bitmaps\n");
+ goto out;
+ }
+ }
+
+ ret = test_func(&trans, root->fs_info, cache, path);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+ if (ret) {
+ test_msg("Could not remove block group free space\n");
+ goto out;
+ }
+
+ if (btrfs_header_nritems(root->node) != 0) {
+ test_msg("Free space tree has leftover items\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ btrfs_free_dummy_block_group(cache);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+static int run_test_both_formats(test_func_t test_func)
+{
+ int ret;
+
+ ret = run_test(test_func, 0);
+ if (ret)
+ return ret;
+ return run_test(test_func, 1);
+}
+
+int btrfs_test_free_space_tree(void)
+{
+ test_func_t tests[] = {
+ test_empty_block_group,
+ test_remove_all,
+ test_remove_beginning,
+ test_remove_end,
+ test_remove_middle,
+ test_merge_left,
+ test_merge_right,
+ test_merge_both,
+ test_merge_none,
+ };
+ int i;
+
+ test_msg("Running free space tree tests\n");
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ int ret = run_test_both_formats(tests[i]);
+ if (ret) {
+ test_msg("%pf failed\n", tests[i]);
+ return ret;
+ }
+ }
+
+ return 0;
+}
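Each case above follows the same shape: empty the block group, re-populate it with __add_to_free_space_tree(), and compare the result against an expected free_space_extent array. A new case only needs a function with the test_func_t signature appended to the tests[] table; run_test_both_formats() then exercises it against both the extent and the bitmap representation. A minimal skeleton (the name test_example is hypothetical; the helpers are the ones used above):

	static int test_example(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_block_group_cache *cache,
				struct btrfs_path *path)
	{
		/* Expected layout after the operations below. */
		struct free_space_extent extents[] = {
			{cache->key.objectid, cache->key.offset},
		};

		/* ... __add_to_free_space_tree() /
		 *     __remove_from_free_space_tree() calls go here ... */

		return check_free_space_extents(trans, fs_info, cache, path,
						extents, ARRAY_SIZE(extents));
	}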
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 054fc0d97131..5de55fdd28bc 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
static void setup_file_extents(struct btrfs_root *root)
{
int slot = 0;
- u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 disk_bytenr = SZ_1M;
u64 offset = 0;
/* First we want a hole */
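SZ_1M and the other SZ_* constants used throughout this series come from include/linux/sizes.h, so the substitutions are purely cosmetic. For reference, the relevant definitions are:

	/* include/linux/sizes.h (excerpt) */
	#define SZ_64K	0x00010000
	#define SZ_1M	0x00100000
	#define SZ_32M	0x02000000
	#define SZ_256M	0x10000000
	#define SZ_1G	0x40000000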
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 846d277b1901..8ea5d34bc5a2 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -23,14 +23,6 @@
#include "../qgroup.h"
#include "../backref.h"
-static void init_dummy_trans(struct btrfs_trans_handle *trans)
-{
- memset(trans, 0, sizeof(*trans));
- trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
- trans->type = __TRANS_DUMMY;
-}
-
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
{
@@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
ins.objectid = bytenr;
ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
struct btrfs_path *path;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
ret = btrfs_create_qgroup(NULL, fs_info, 5);
@@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup multiple refs test\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a5b06442f0bf..b6031ce474f7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
list_del_init(&em->list);
free_extent_map(em);
}
+ /*
+ * If any block groups are found in ->deleted_bgs then it's
+ * because the transaction was aborted and a commit did not
+ * happen (things failed before writing the new superblock
+ * and calling btrfs_finish_extent_commit()), so we cannot
+ * discard the physical locations of the block groups.
+ */
+ while (!list_empty(&transaction->deleted_bgs)) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = list_first_entry(&transaction->deleted_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&cache->bg_list);
+ btrfs_put_block_group_trimming(cache);
+ btrfs_put_block_group(cache);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
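The added cleanup is the standard list-drain idiom: take the first entry, unlink it with list_del_init(), then drop the references the list was pinning. Reduced to its skeleton (struct item, link and put_item() are hypothetical names):

	while (!list_empty(&head)) {
		struct item *it = list_first_entry(&head, struct item, link);

		list_del_init(&it->link);	/* unlink before releasing */
		put_item(it);			/* drop the pinned reference */
	}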
@@ -82,6 +99,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here; the state
+ * of the waitqueue should not change once clear_btree_io_tree is
+ * called.
+ */
+ smp_mb();
while (!RB_EMPTY_ROOT(&tree->state)) {
struct rb_node *node;
struct extent_state *state;
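waitqueue_active() is an unlocked peek, so a waker must order its state update against that check; hoisting one smp_mb() above the drain loop provides the ordering for every wake_up() issued inside it. The general pairing (a sketch with hypothetical names, following the idiom documented in include/linux/wait.h):

	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(wq);
	static int cond;

	static void waker(void)
	{
		cond = 1;
		smp_mb();	/* order the store against waitqueue_active() */
		if (waitqueue_active(&wq))
			wake_up(&wq);
	}

	static void sleeper(void)
	{
		wait_event(wq, cond);	/* re-checks cond with its own barriers */
	}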
@@ -226,25 +249,22 @@ loop:
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
+ init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->have_free_bgs = 0;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
- cur_trans->dirty_bg_run = 0;
+
+ memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.pending_csums = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -264,7 +284,6 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
- INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@ -272,7 +291,6 @@ loop:
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
- spin_lock_init(&cur_trans->deleted_bgs_lock);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -447,8 +465,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
- enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+ unsigned int type, enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
@@ -478,13 +496,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
* the appropriate flushing if need be.
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
- if (root->fs_info->quota_enabled &&
- is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->nodesize;
- ret = btrfs_qgroup_reserve(root, qgroup_reserved);
- if (ret)
- return ERR_PTR(ret);
- }
+ qgroup_reserved = num_items * root->nodesize;
+ ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
@@ -502,7 +517,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
goto reserve_fail;
}
again:
- h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h) {
ret = -ENOMEM;
goto alloc_fail;
@@ -543,26 +558,13 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->blocks_used = 0;
- h->bytes_reserved = 0;
- h->chunk_bytes_reserved = 0;
h->root = root;
- h->delayed_ref_updates = 0;
h->use_count = 1;
- h->adding_csums = 0;
- h->block_rsv = NULL;
- h->orig_rsv = NULL;
- h->aborted = 0;
- h->qgroup_reserved = 0;
- h->delayed_ref_elem.seq = 0;
+
h->type = type;
- h->allocating_chunk = false;
h->can_flush_pending_bgs = true;
- h->reloc_reserved = false;
- h->sync = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
- INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -579,7 +581,6 @@ again:
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
- h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
@@ -597,20 +598,52 @@ alloc_fail:
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
+ btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items)
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL);
}
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor)
+{
+ struct btrfs_trans_handle *trans;
+ u64 num_bytes;
+ int ret;
+
+ trans = btrfs_start_transaction(root, num_items);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes,
+ min_factor);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->bytes_reserved = num_bytes;
+
+ return trans;
+}
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items)
+ struct btrfs_root *root,
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_LIMIT);
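btrfs_start_transaction_fallback_global_rsv() first attempts the normal reservation and, only on -ENOSPC, retries with a zero-item transaction and migrates the needed bytes from the global reserve, provided that reserve is sufficiently full (min_factor is handed to btrfs_cond_migrate_bytes()). A hypothetical caller on a must-make-progress path would use it like this (sketch; the item count is illustrative):

	static int do_unlink_like_op(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		/* Five items; fall back if the global rsv is >= 5/10 full. */
		trans = btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		/* ... metadata changes covered by the reservation ... */

		return btrfs_end_transaction(trans, root);
	}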
@@ -618,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN, 0);
+ return start_transaction(root, 0, TRANS_JOIN,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_USERSPACE, 0);
+ return start_transaction(root, 0, TRANS_USERSPACE,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -646,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
*/
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_ATTACH, 0);
+ return start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -661,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ trans = start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
btrfs_wait_for_commit(root, 0);
@@ -794,12 +832,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- if (!list_empty(&trans->ordered)) {
- spin_lock(&info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
- spin_unlock(&info->trans_lock);
- }
-
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -815,15 +847,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- if (trans->qgroup_reserved) {
- /*
- * the same root has to be passed here between start_transaction
- * and end_transaction. Subvolume quota depends on this.
- */
- btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -856,6 +879,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
@@ -1238,6 +1264,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
+ btrfs_qgroup_free_meta_all(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1314,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 root_flags;
uuid_le new_uuid;
- path = btrfs_alloc_path();
- if (!path) {
- pending->error = -ENOMEM;
- return 0;
- }
+ ASSERT(pending->path);
+ path = pending->path;
- new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
- if (!new_root_item) {
- pending->error = -ENOMEM;
- goto root_item_alloc_fail;
- }
+ ASSERT(pending->root_item);
+ new_root_item = pending->root_item;
pending->error = btrfs_find_free_objectid(tree_root, &objectid);
if (pending->error)
@@ -1557,8 +1578,10 @@ clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
-root_item_alloc_fail:
+ pending->root_item = NULL;
btrfs_free_path(path);
+ pending->path = NULL;
+
return ret;
}
@@ -1795,25 +1818,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
}
static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &ordered->flags));
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
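The pending_ordered list (and the trans_lock round trips needed to walk it) is replaced by a plain counter plus a dedicated wait queue, so the commit path sleeps on a single condition. The completion side is expected to pair with this roughly as follows (sketch, not shown in this hunk):

	/* Presumed completion side: the last pending ordered extent wakes the
	 * committer; atomic_dec_and_test() implies the needed barrier. */
	if (atomic_dec_and_test(&cur_trans->pending_ordered))
		wake_up(&cur_trans->pending_wait);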
@@ -1842,10 +1850,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
cur_trans = trans->transaction;
@@ -1865,7 +1869,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- if (!cur_trans->dirty_bg_run) {
+ if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
/* this mutex is also taken before trying to set
@@ -1874,18 +1878,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* after a extents from that block group have been
* allocated for cache files. btrfs_set_block_group_ro
* will wait for the transaction to commit if it
- * finds dirty_bg_run = 1
+ * finds BTRFS_TRANS_DIRTY_BG_RUN set.
*
- * The dirty_bg_run flag is also used to make sure only
- * one process starts all the block group IO. It wouldn't
+ * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+ * only one process starts all the block group IO. It wouldn't
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (!cur_trans->dirty_bg_run) {
+ if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+ &cur_trans->flags))
run_it = 1;
- cur_trans->dirty_bg_run = 1;
- }
mutex_unlock(&root->fs_info->ro_block_group_mutex);
if (run_it)
@@ -1897,7 +1900,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1956,7 +1958,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
- btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans);
btrfs_scrub_pause(root);
/*
@@ -2136,7 +2138,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto scrub_continue;
@@ -2156,7 +2158,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- if (cur_trans->have_free_bgs)
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(root->fs_info);
root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2198,10 +2200,6 @@ cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index a994bb097ee5..72be51f7ca2f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -32,6 +32,10 @@ enum btrfs_trans_state {
TRANS_STATE_MAX = 6,
};
+#define BTRFS_TRANS_HAVE_FREE_BGS 0
+#define BTRFS_TRANS_DIRTY_BG_RUN 1
+#define BTRFS_TRANS_CACHE_ENOSPC 2
+
struct btrfs_transaction {
u64 transid;
/*
@@ -46,11 +50,9 @@ struct btrfs_transaction {
*/
atomic_t num_writers;
atomic_t use_count;
+ atomic_t pending_ordered;
- /*
- * true if there is free bgs operations in this transaction
- */
- int have_free_bgs;
+ unsigned long flags;
/* Protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
@@ -59,9 +61,9 @@ struct btrfs_transaction {
unsigned long start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
+ wait_queue_head_t pending_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
- struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
struct list_head io_bgs;
@@ -75,12 +77,11 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
- spinlock_t deleted_bgs_lock;
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
- int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -107,7 +108,6 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
- u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
@@ -129,7 +129,6 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
- struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -138,8 +137,10 @@ struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
struct btrfs_root *root;
+ struct btrfs_root_item *root_item;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
+ struct btrfs_path *path;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
@@ -185,9 +186,14 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items);
+ unsigned int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items);
+ struct btrfs_root *root,
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
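Folding have_free_bgs and dirty_bg_run into bit positions in one unsigned long lets the transaction use the atomic bitops, which turns the claim-once logic in btrfs_commit_transaction() into a single operation. In isolation (a sketch):

	#include <linux/bitops.h>

	static unsigned long trans_flags;

	static bool claim_dirty_bg_run(void)
	{
		/* Returns true only for the first caller: test_and_set_bit()
		 * atomically sets the bit and reports its previous value. */
		return !test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans_flags);
	}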
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f31db4325339..cb65089127cc 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
+ * deadlock later (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0;
goto out;
}
- path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- min_trans);
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
&last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
WARN_ON(ret == -EAGAIN);
goto out;
}
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing; this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ min_trans);
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1bbaace73383..323e12cc9d2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
@@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset, 0);
+ key->objectid, offset);
if (ret)
goto out;
} else {
@@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&log_root_tree->log_writer_wait))
wake_up(&log_root_tree->log_writer_wait);
}
@@ -2950,6 +2954,9 @@ out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
@@ -2961,6 +2968,9 @@ out:
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5328,7 +5338,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5346,7 +5356,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5361,7 +5371,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_error(fs_info, ret, "Couldn't read target root "
+ btrfs_std_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc735869c18..c32abbca9d77 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,82 @@
#include "dev-replace.h"
#include "sysfs.h"
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
+};
+
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
+ [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
+ [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
+ [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
+ [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
+};
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
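Making btrfs_raid_array global (and teaching it tolerated_failures) turns per-profile properties into table lookups instead of scattered switch statements. A consumer indexes it with the btrfs_raid_types value for the profile, for example (sketch):

	/* How many device failures a profile survives, per the table above. */
	static int tolerated_failures_for(enum btrfs_raid_types index)
	{
		return btrfs_raid_array[index].tolerated_failures;
	}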
@@ -49,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static void btrfs_close_one_device(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -156,8 +233,8 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
return dev;
}
@@ -198,7 +275,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
- printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
@@ -211,8 +287,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
- if (!*bh) {
- ret = -EINVAL;
+ if (IS_ERR(*bh)) {
+ ret = PTR_ERR(*bh);
blkdev_put(*bdev, flags);
goto error;
}
@@ -345,6 +421,9 @@ loop_lock:
pending = pending->bi_next;
cur->bi_next = NULL;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -765,36 +844,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
-
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- list_del_init(&device->dev_alloc_list);
- fs_devices->rw_devices--;
- }
-
- if (device->missing)
- fs_devices->missing_devices--;
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
- }
-
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
-
- call_rcu(&device->rcu, free_device);
+ btrfs_close_one_device(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -1053,7 +1103,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = device->devid;
key.offset = start;
@@ -1208,6 +1258,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
int ret;
int slot;
struct extent_buffer *l;
+ u64 min_search_start;
+
+ /*
+ * We don't want to overwrite the superblock on the drive nor any area
+ * used by the boot loader (grub for example), so we make sure to start
+ * at an offset of at least 1MB.
+ */
+ min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ search_start = max(search_start, min_search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1222,7 +1281,7 @@ again:
goto out;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -1348,18 +1407,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
- struct btrfs_root *root = device->dev_root;
- u64 search_start;
-
/* FIXME use last free of some kind */
-
- /*
- * we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
- */
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
return find_free_dev_extent_start(trans->transaction, device,
- num_bytes, search_start, start, len);
+ num_bytes, 0, start, len);
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1402,7 +1452,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_error(root->fs_info, ret, "Slot search failed");
+ btrfs_std_error(root->fs_info, ret, "Slot search failed");
goto out;
}
@@ -1410,10 +1460,10 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
- trans->transaction->have_free_bgs = 1;
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
}
out:
btrfs_free_path(path);
@@ -1593,7 +1643,6 @@ static void update_dev_time(char *path_name)
return;
file_update_time(filp);
filp_close(filp, NULL);
- return;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
@@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1924,7 +1973,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->writeable) {
fs_devices->rw_devices--;
/* zero out the old super if it is writable */
- btrfs_scratch_superblock(srcdev);
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
if (srcdev->bdev)
@@ -1971,10 +2020,10 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
if (tgtdev->bdev) {
- btrfs_scratch_superblock(tgtdev);
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
@@ -2041,10 +2090,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
}
- if (!*device) {
- btrfs_err(root->fs_info, "no missing device found");
- return -ENOENT;
- }
+ if (!*device)
+ return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
return 0;
} else {
@@ -2309,7 +2356,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2350,9 +2397,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
fsid_buf))
- pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
+ btrfs_warn(root->fs_info,
+ "sysfs: failed to create fsid for sprout");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2368,7 +2416,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
@@ -2388,7 +2436,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2613,7 +2661,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_error(root->fs_info, -ENOENT,
+ btrfs_std_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
@@ -2621,7 +2669,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
@@ -2803,10 +2851,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
if (ret)
return ret;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_trans_remove_block_group(root->fs_info,
+ chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
@@ -3009,16 +3058,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
@@ -3070,17 +3122,50 @@ static int chunk_profiles_filter(u64 chunk_type,
return 1;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
+ u64 chunk_used;
+ u64 user_thresh_min;
+ u64 user_thresh_max;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ if (bargs->usage_min == 0)
+ user_thresh_min = 0;
+ else
+ user_thresh_min = div_factor_fine(cache->key.offset,
+ bargs->usage_min);
+
+ if (bargs->usage_max == 0)
+ user_thresh_max = 1;
+ else if (bargs->usage_max > 100)
+ user_thresh_max = cache->key.offset;
+ else
+ user_thresh_max = div_factor_fine(cache->key.offset,
+ bargs->usage_max);
+
+ if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- if (bargs->usage == 0)
+ if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
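With BTRFS_BALANCE_ARGS_USAGE_RANGE the usage test becomes a half-open interval, [usage_min%, usage_max%) of the chunk size, rather than a single threshold. Restated as standalone arithmetic (sketch; div_factor_fine(n, f) is n * f / 100):

	/* Returns 0 ("balance this chunk") when used falls in the range. */
	static int usage_in_range(u64 size, u64 used, u32 min, u32 max)
	{
		u64 lo = size * min / 100;
		u64 hi = max > 100 ? size : size * max / 100;

		if (max == 0)
			hi = 1;		/* mirrors the filter's special case */
		return (lo <= used && used < hi) ? 0 : 1;
	}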
@@ -3170,6 +3255,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
return 1;
}
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ if (bargs->stripes_min <= num_stripes
+ && num_stripes <= bargs->stripes_max)
+ return 0;
+
+ return 1;
+}
+
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
@@ -3216,6 +3314,9 @@ static int should_balance_chunk(struct btrfs_root *root,
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
return 0;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
}
/* devid filter */
@@ -3236,6 +3337,12 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /* stripes filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+ chunk_stripes_range_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
@@ -3250,6 +3357,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
else
bargs->limit--;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+ /*
+ * Same logic as the 'limit' filter; the minimum cannot be
+ * determined here because we do not have the global information
+ * about the count of all chunks that satisfy the filters.
+ */
+ if (bargs->limit_max == 0)
+ return 0;
+ else
+ bargs->limit_max--;
}
return 1;
@@ -3264,6 +3381,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
@@ -3274,16 +3392,21 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ /*
+ * The single value limit and min/max limits use the same bytes in the
+ * btrfs_balance_args union; saving the single value below therefore
+ * preserves the range values as well.
+ */
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
+ u32 count_data = 0;
+ u32 count_meta = 0;
+ u32 count_sys = 0;
+ int chunk_reserved = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
- size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ size_to_free = min_t(u64, size_to_free, SZ_1M);
if (!device->writeable ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
@@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
+ /*
+ * The single value limit and min/max limits use the same bytes
+ * in the btrfs_balance_args union, so restoring ->limit here also
+ * restores the min/max range values.
+ */
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
@@ -3364,6 +3491,7 @@ again:
}
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
@@ -3373,6 +3501,7 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
+
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3384,9 +3513,49 @@ again:
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ count_data++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ count_sys++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ count_meta++;
+
+ goto loop;
+ }
+
+ /*
+ * Apply limit_min filter, no need to check if the LIMITS
+ * filter is used, limit_min is 0 by default
+ */
+ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ count_data < bctl->data.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+ count_meta < bctl->meta.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ count_sys < bctl->sys.limit_min)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
}
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ trans = btrfs_start_transaction(chunk_root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ ret = btrfs_force_chunk_alloc(trans, chunk_root,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans, chunk_root);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto error;
+ }
+ chunk_reserved = 1;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3461,11 +3630,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret);
+ btrfs_std_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
+/* Non-zero return value signifies invalidity */
+static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
+ u64 allowed)
+{
+ return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl_arg->target, 1) ||
+ (bctl_arg->target & ~allowed)));
+}
+
/*
* Should be called with both balance and volume mutexes held
*/
@@ -3523,27 +3701,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (num_devices > 3)
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
- if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->data.target, 1) ||
- (bctl->data.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->data, allowed)) {
btrfs_err(fs_info, "unable to start balance with target "
"data profile %llu",
bctl->data.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->meta.target, 1) ||
- (bctl->meta.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->meta, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target metadata profile %llu",
bctl->meta.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->sys.target, 1) ||
- (bctl->sys.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->sys, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target system profile %llu",
bctl->sys.target);
@@ -3551,14 +3723,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
goto out;
}
- /* allow dup'ed data chunks only in mixed mode */
- if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
- btrfs_err(fs_info, "dup for data is not allowed");
- ret = -EINVAL;
- goto out;
- }
-
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
@@ -3584,6 +3748,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
+ if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ btrfs_warn(fs_info,
+ "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx",
+ bctl->meta.target, bctl->data.target);
+ }
+
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures = min(
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
@@ -4096,7 +4267,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
lock_chunks(root);
@@ -4285,69 +4456,10 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = {
- .sub_stripes = 2,
- .dev_stripes = 1,
- .devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID1] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 2,
- .devs_min = 2,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_DUP] = {
- .sub_stripes = 1,
- .dev_stripes = 2,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID0] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_SINGLE] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_RAID5] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID6] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 3,
- .devs_increment = 1,
- .ncopies = 3,
- },
-};
-
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
- return 64 * 1024;
+ return SZ_64K;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
@@ -4415,21 +4527,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = 1024 * 1024 * 1024;
+ max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
- max_stripe_size = 1024 * 1024 * 1024;
+ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ max_stripe_size = SZ_1G;
else
- max_stripe_size = 256 * 1024 * 1024;
+ max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = 32 * 1024 * 1024;
+ max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
@@ -4680,7 +4792,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 dev_offset;
u64 stripe_size;
int i = 0;
- int ret;
+ int ret = 0;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
@@ -4711,20 +4823,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()).
+ */
+ mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out;
+ break;
ret = btrfs_alloc_dev_extent(trans, device,
chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
chunk_offset, dev_offset,
stripe_size);
if (ret)
- goto out;
+ break;
+ }
+ if (ret) {
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ goto out;
}
stripe = &chunk->stripe;
@@ -4737,6 +4861,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -6352,11 +6477,11 @@ int btrfs_read_sys_array(struct btrfs_root *root)
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
- btrfs_set_buffer_uptodate(sb);
+ set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artificial and just used to read the system array.
- * btrfs_set_buffer_uptodate() call does not properly mark all it's
+ * set_extent_buffer_uptodate() call does not properly mark all its
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
@@ -6399,6 +6524,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ printk(KERN_ERR
+ "BTRFS: invalid number of stripes %u in sys_array at offset %u\n",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6407,6 +6540,9 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (ret)
break;
} else {
+ printk(KERN_ERR
+ "BTRFS: unexpected item type %u in sys_array at offset %u\n",
+ (u32)key.type, cur_offset);
ret = -EIO;
break;
}
@@ -6594,8 +6730,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "error %d while searching for dev_stats item for device %s!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
}
@@ -6605,8 +6741,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "delete too small dev_stats item for device %s failed %d!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
@@ -6619,9 +6755,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "insert dev_stats item for device %s failed %d!\n",
- rcu_str_deref(device->name), ret);
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "insert dev_stats item for device %s failed %d",
+ rcu_str_deref(device->name), ret);
goto out;
}
}
@@ -6675,8 +6811,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6695,8 +6831,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- printk_in_rcu(KERN_INFO "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_info_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6740,22 +6876,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
return 0;
}
-int btrfs_scratch_superblock(struct btrfs_device *device)
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
{
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
+ int copy_num;
- bh = btrfs_read_dev_super(device->bdev);
- if (!bh)
- return -EINVAL;
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ if (!bdev)
+ return;
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
+ copy_num++) {
- return 0;
+ if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
}
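btrfs_scratch_superblocks() now wipes every superblock copy rather than only the primary, looping over BTRFS_SUPER_MIRROR_MAX and reading each copy via btrfs_read_dev_one_super(). A sketch of how the mirror offsets are conventionally derived (the 64KiB/64MiB/256GiB layout is assumed here; the real helper is btrfs_sb_offset()):

#include <linux/types.h>

#define SB_MIRROR_SHIFT	12	/* assumed mirror spacing shift */

static u64 sb_offset(int mirror)
{
	u64 start = 16 * 1024;			/* assumed 16KiB base */

	if (mirror)
		return start << (SB_MIRROR_SHIFT * mirror); /* 64MiB, 256GiB */
	return 64 * 1024;			/* primary copy at 64KiB */
}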
/*
@@ -6823,3 +6971,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
+
+static void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+}
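btrfs_close_one_device() relies on the classic RCU replace-then-defer-free pattern: the old device is swapped out of the list with list_replace_rcu() and only freed after a grace period, so lockless readers never see freed memory. A generic sketch with a hypothetical node type:

#include <linux/rculist.h>
#include <linux/slab.h>

struct node {
	struct list_head list;
	struct rcu_head rcu;
	int id;
};

static void node_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct node, rcu));
}

static void replace_node(struct node *old, struct node *new)
{
	/* readers inside rcu_read_lock() keep a valid @old until the
	 * grace period ends; only then is it actually freed */
	list_replace_rcu(&old->list, &new->list);
	call_rcu(&old->rcu, node_free_rcu);
}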
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 595279a8b99f..1939ebde63df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,7 +26,7 @@
extern struct mutex uuid_mutex;
-#define BTRFS_STRIPE_LEN (64 * 1024)
+#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
struct btrfs_pending_bios {
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
- struct kobject super_kobj;
+ struct kobject fsid_kobj;
struct kobject *device_dir_kobj;
struct completion kobj_unregister;
};
@@ -334,10 +334,15 @@ struct btrfs_raid_attr {
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
+ int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies of the data there are */
};
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
+
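tolerated_failures records how many whole-device losses a profile can absorb. An illustrative initializer only (the field values are assumed from the profile's semantics, not copied from the kernel table): a two-copy RAID1 survives one failed device.

static const struct btrfs_raid_attr example_raid1 = {
	.dev_stripes		= 1,	/* stripes per dev */
	.devs_max		= 2,
	.devs_min		= 2,
	.tolerated_failures	= 1,	/* one lost device is survivable */
	.devs_increment		= 2,
	.ncopies		= 2,
};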
struct map_lookup {
u64 type;
int io_align;
@@ -375,6 +380,9 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
#define BTRFS_BALANCE_ARGS_MASK \
(BTRFS_BALANCE_ARGS_PROFILES | \
@@ -382,7 +390,10 @@ struct map_lookup {
BTRFS_BALANCE_ARGS_DEVID | \
BTRFS_BALANCE_ARGS_DRANGE | \
BTRFS_BALANCE_ARGS_VRANGE | \
- BTRFS_BALANCE_ARGS_LIMIT)
+ BTRFS_BALANCE_ARGS_LIMIT | \
+ BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
+ BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
+ BTRFS_BALANCE_ARGS_USAGE_RANGE)
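The three new *_RANGE flags let balance filters match a [min, max) window instead of a single value. A sketch of how such a test could look (min and max are taken as plain parameters here; how they are packed into struct btrfs_balance_args is an implementation detail not shown by this hunk):

#include <linux/types.h>

static bool usage_in_range(u64 flags, u32 usage, u32 min, u32 max)
{
	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		return usage >= min && usage < max;
	return true;	/* range filter not requested */
}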
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
@@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
-int btrfs_scratch_superblock(struct btrfs_device *device);
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6f518c90e1c1..fd953c361a43 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
+ if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+ goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
if (verify_dir_item(root, leaf, di))
@@ -349,137 +351,89 @@ err:
return ret;
}
-/*
- * List of handlers for synthetic system.* attributes. All real ondisk
- * attributes are handled directly.
- */
-const struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL,
-};
-
-/*
- * Check if the attribute is in a supported namespace.
- *
- * This is applied after the check for the synthetic attributes in the system
- * namespace.
- */
-static int btrfs_is_valid_xattr(const char *name)
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- int len = strlen(name);
- int prefixlen = 0;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN))
- prefixlen = XATTR_SECURITY_PREFIX_LEN;
- else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- prefixlen = XATTR_SYSTEM_PREFIX_LEN;
- else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- prefixlen = XATTR_TRUSTED_PREFIX_LEN;
- else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- prefixlen = XATTR_USER_PREFIX_LEN;
- else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- prefixlen = XATTR_BTRFS_PREFIX_LEN;
- else
- return -EOPNOTSUPP;
-
- /*
- * The name cannot consist of just prefix
- */
- if (len <= prefixlen)
- return -EINVAL;
+ struct inode *inode = d_inode(dentry);
- return 0;
+ name = xattr_full_name(handler, name);
+ return __btrfs_getxattr(inode, name, buffer, size);
}
-ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size,
+ int flags)
{
- int ret;
+ struct inode *inode = d_inode(dentry);
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
+ name = xattr_full_name(handler, name);
+ return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+}
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
- return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ name = xattr_full_name(handler, name);
+ return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
}
+static const struct xattr_handler btrfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+ .prefix = XATTR_BTRFS_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set_prop,
+};
+
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+ &btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &btrfs_trusted_xattr_handler,
+ &btrfs_user_xattr_handler,
+ &btrfs_btrfs_xattr_handler,
+ NULL,
+};
+
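With per-prefix handlers registered, the VFS strips the prefix before calling ->get/->set, and xattr_full_name() recovers the on-disk name by stepping the suffix pointer back over the prefix. That only works because the suffix still points into the original "prefix.name" string; a sketch under that assumption:

#include <linux/string.h>
#include <linux/xattr.h>

static const char *full_name(const struct xattr_handler *handler,
			     const char *suffix)
{
	/* for "user.foo" the handler is called with suffix == "foo";
	 * backing up strlen("user.") bytes restores "user.foo" */
	return suffix - strlen(handler->prefix);
}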
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- value, size, flags);
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
- flags);
+ return generic_setxattr(dentry, name, value, size, flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- NULL, 0, XATTR_REPLACE);
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
- XATTR_REPLACE);
+ return generic_removexattr(dentry, name);
}
static int btrfs_initxattrs(struct inode *inode,
@@ -492,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_NOFS);
+ strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
break;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5049608d1388..96807b3d22f5 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size);
extern int btrfs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
extern int btrfs_removexattr(struct dentry *dentry, const char *name);
diff --git a/fs/buffer.c b/fs/buffer.c
index 82283abb2795..e1632abb4ca9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -134,13 +134,10 @@ __clear_page_buffers(struct page *page)
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
- char b[BDEVNAME_SIZE];
-
if (!test_bit(BH_Quiet, &bh->b_state))
printk_ratelimited(KERN_ERR
- "Buffer I/O error on dev %s, logical block %llu%s\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr, msg);
+ "Buffer I/O error on dev %pg, logical block %llu%s\n",
+ bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}
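%pg is the printk extension that prints a block device's name (e.g. "sda1") straight from the struct block_device pointer, which is what lets these hunks drop the on-stack char b[BDEVNAME_SIZE] buffers. Minimal sketch:

#include <linux/buffer_head.h>
#include <linux/printk.h>

static void report_io_error(struct buffer_head *bh)
{
	printk_ratelimited(KERN_ERR
		"I/O error on dev %pg, logical block %llu\n",
		bh->b_bdev, (unsigned long long)bh->b_blocknr);
}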
/*
@@ -237,15 +234,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
if (all_mapped) {
- char b[BDEVNAME_SIZE];
-
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
- printk("device %s blocksize: %d\n", bdevname(bdev, b),
+ printk("device %pg blocksize: %d\n", bdev,
1 << bd_inode->i_blkbits);
}
out_unlock:
@@ -531,10 +526,8 @@ repeat:
static void do_thaw_one(struct super_block *sb, void *unused)
{
- char b[BDEVNAME_SIZE];
while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
- printk(KERN_WARNING "Emergency Thaw on %s\n",
- bdevname(sb->s_bdev, b));
+ printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
static void do_thaw_all(struct work_struct *work)
@@ -999,7 +992,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+ gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
/*
* XXX: __getblk_slow() can not really deal with failure and
@@ -1074,12 +1067,10 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
* pagecache index. (this comparison is done using sector_t types).
*/
if (unlikely(index != block >> sizebits)) {
- char b[BDEVNAME_SIZE];
-
printk(KERN_ERR "%s: requested out-of-range block %llu for "
- "device %s\n",
+ "device %pg\n",
__func__, (unsigned long long)block,
- bdevname(bdev, b));
+ bdev);
return -EIO;
}
@@ -2420,9 +2411,9 @@ EXPORT_SYMBOL(block_commit_write);
* unlock the page.
*
* Direct callers of this function should protect against filesystem freezing
- * using sb_start_write() - sb_end_write() functions.
+ * using sb_start_pagefault() - sb_end_pagefault() functions.
*/
-int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct page *page = vmf->page;
@@ -2459,26 +2450,6 @@ out_unlock:
unlock_page(page);
return ret;
}
-EXPORT_SYMBOL(__block_page_mkwrite);
-
-int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
-{
- int ret;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
- sb_start_pagefault(sb);
-
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
- ret = __block_page_mkwrite(vma, vmf, get_block);
- sb_end_pagefault(sb);
- return block_page_mkwrite_return(ret);
-}
EXPORT_SYMBOL(block_page_mkwrite);
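With the __block_page_mkwrite()/block_page_mkwrite() pair collapsed into one function, the freeze protection and time update that the deleted wrapper used to do now belong in each filesystem's ->page_mkwrite. A sketch of what a caller looks like after this change (my_get_block is a hypothetical get_block_t):

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mm.h>

static int my_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh, int create);	/* hypothetical */

static int myfs_page_mkwrite(struct vm_area_struct *vma,
			     struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
	int ret;

	sb_start_pagefault(sb);		/* freeze protection */
	file_update_time(vma->vm_file);	/* update times before page lock */
	ret = block_page_mkwrite(vma, vmf, my_get_block);
	sb_end_pagefault(sb);
	return block_page_mkwrite_return(ret);
}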
/*
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index f601def05bdf..452e98dd7560 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -226,15 +226,9 @@ static ssize_t cachefiles_daemon_write(struct file *file,
return -EOPNOTSUPP;
/* drag the command string into the kernel so we can parse it */
- data = kmalloc(datalen + 1, GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(data, _data, datalen) != 0)
- goto error;
-
- data[datalen] = '\0';
+ data = memdup_user_nul(_data, datalen);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
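memdup_user_nul() replaces the removed kmalloc/copy_from_user/NUL-terminate sequence one-for-one. Roughly what the helper guarantees (a sketch of its contract, not its exact source):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

static void *memdup_user_nul_sketch(const void __user *src, size_t len)
{
	char *p = kmalloc(len + 1, GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';		/* always NUL-terminated */
	return p;
}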
ret = -EINVAL;
if (memchr(data, '\0', datalen))
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index aecd0859eacb..9c4b737a54df 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -30,7 +30,7 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
/*
* node records
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index fc1056f5c96a..c4b893453e0e 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -655,6 +655,8 @@ lookup_again:
aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
if (!aops->bmap)
goto check_error;
+ if (object->dentry->d_sb->s_blocksize > PAGE_SIZE)
+ goto check_error;
object->backer = object->dentry;
} else {
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3cbb0e834694..c0f3da3926a0 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -414,9 +414,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
@@ -711,9 +708,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto all_enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
pagevec_init(&pagevec, 0);
@@ -885,7 +879,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
loff_t pos, eof;
size_t len;
void *data;
- int ret;
+ int ret = -ENOBUFS;
ASSERT(op != NULL);
ASSERT(page != NULL);
@@ -905,6 +899,15 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
+ pos = (loff_t)page->index << PAGE_SHIFT;
+
+ /* We mustn't write more data than we have, so we have to beware of a
+ * partial page at EOF.
+ */
+ eof = object->fscache.store_limit_l;
+ if (pos >= eof)
+ goto error;
+
/* write the page to the backing filesystem and let it store it in its
* own time */
path.mnt = cache->mnt;
@@ -912,40 +915,38 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
- } else {
- pos = (loff_t) page->index << PAGE_SHIFT;
-
- /* we mustn't write more data than we have, so we have
- * to beware of a partial page at EOF */
- eof = object->fscache.store_limit_l;
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- ASSERTCMP(pos, <, eof);
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
- }
-
- data = kmap(page);
- ret = __kernel_write(file, data, len, &pos);
- kunmap(page);
- if (ret != len)
- ret = -EIO;
- fput(file);
+ goto error_2;
}
- if (ret < 0) {
- if (ret == -EIO)
- cachefiles_io_error_obj(
- object, "Write page to backing file failed");
- ret = -ENOBUFS;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
+ }
}
- _leave(" = %d", ret);
- return ret;
+ data = kmap(page);
+ ret = __kernel_write(file, data, len, &pos);
+ kunmap(page);
+ fput(file);
+ if (ret != len)
+ goto error_eio;
+
+ _leave(" = 0");
+ return 0;
+
+error_eio:
+ ret = -EIO;
+error_2:
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Write page to backing file failed");
+error:
+ _leave(" = -ENOBUFS [%d]", ret);
+ return -ENOBUFS;
}
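The restructured write path hoists the EOF check to before the backing file is even opened; the length clamp itself is unchanged. A worked example of that clamp, assuming PAGE_SIZE == 4096: with eof = 10000 and page->index = 2, pos = 8192, eof & ~PAGE_MASK = 1808 (a partial last page), and eof - pos = 1808 < PAGE_SIZE, so len = 1808 and pos + len == eof.

#include <linux/mm.h>

static size_t clamp_write_len(loff_t pos, loff_t eof)
{
	size_t len = PAGE_SIZE;

	if ((eof & ~PAGE_MASK) && eof - pos < PAGE_SIZE)
		len = eof - pos;	/* partial page at EOF */
	return len;
}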
/*
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 8f84646f10e9..f19708487e2f 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &new_mode);
if (ret < 0)
@@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
ret = acl ? -EINVAL : 0;
goto out;
}
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
ret = -EINVAL;
@@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
if (acl) {
- size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
if (err)
goto out_err;
- ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+ ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
len);
err = posix_acl_to_xattr(&init_user_ns, acl,
tmp_buf, val_size1);
@@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
ceph_pagelist_append(pagelist, tmp_buf, val_size1);
}
if (default_acl) {
- size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
err = ceph_pagelist_encode_string(pagelist,
- POSIX_ACL_XATTR_DEFAULT, len);
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9d23e788d1df..b7d218a168fb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret1;
struct address_space *mapping = inode->i_mapping;
struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out;
@@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
if (i_size_read(inode) == 0)
return;
page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
return;
if (PageUptodate(page)) {
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 834f9f3723fb..a4766ded1ba7 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
- /* use ceph virtual inode (id + snaphot) */
+ /* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 27b566874bc1..c69e1253b47b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1655,9 +1655,8 @@ retry_locked:
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
- (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1970,46 @@ out:
}
/*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
*/
-static int unsafe_dirop_wait(struct inode *inode)
+static int unsafe_request_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- if (!S_ISDIR(inode->i_mode))
- return 0;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_last_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- ret = !wait_for_completion_timeout(&req->r_safe_completion,
- ceph_timeout_jiffies(req->r_timeout));
+	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+ inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
if (ret)
- ret = -EIO; /* timed out */
-
- ceph_mdsc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- return ret;
+ err = -EIO;
+ ceph_mdsc_put_request(req1);
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req2);
+ }
+ return err;
}
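One subtlety carried over from the old loop: wait_for_completion_timeout() returns 0 on timeout and the remaining jiffies (>= 1) otherwise, so `ret = !wait_for_completion_timeout(...)` is 1 exactly when the wait timed out, which is then turned into -EIO. Isolated sketch of the idiom:

#include <linux/completion.h>
#include <linux/errno.h>

static int wait_one(struct completion *done, unsigned long timeout)
{
	if (!wait_for_completion_timeout(done, timeout))
		return -EIO;	/* timed out */
	return 0;		/* completed in time */
}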
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- ret = unsafe_dirop_wait(inode);
+ ret = unsafe_request_wait(inode);
/*
* only wait on non-file metadata writeback (the mds
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0c62868b5c56..3c68e6aee2f0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -34,6 +34,74 @@
* need to wait for MDS acknowledgement.
*/
+/*
+ * Calculate the length sum of direct io vectors that can
+ * be combined into one page vector.
+ */
+static size_t dio_get_pagev_size(const struct iov_iter *it)
+{
+ const struct iovec *iov = it->iov;
+ const struct iovec *iovend = iov + it->nr_segs;
+ size_t size;
+
+ size = iov->iov_len - it->iov_offset;
+ /*
+ * An iov can be page vectored when both the current tail
+ * and the next base are page aligned.
+ */
+ while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
+ (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
+ size += iov->iov_len;
+ }
+	dout("dio_get_pagev_size len = %zu\n", size);
+ return size;
+}
+
+/*
+ * Allocate a page vector based on (@it, @nbytes).
+ * The return value is the tuple describing a page vector,
+ * that is (@pages, @page_align, @num_pages).
+ */
+static struct page **
+dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
+ size_t *page_align, int *num_pages)
+{
+ struct iov_iter tmp_it = *it;
+ size_t align;
+ struct page **pages;
+ int ret = 0, idx, npages;
+
+ align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+ (PAGE_SIZE - 1);
+ npages = calc_pages_for(align, nbytes);
+ pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages) {
+ pages = vmalloc(sizeof(*pages) * npages);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (idx = 0; idx < npages; ) {
+ size_t start;
+ ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
+ npages - idx, &start);
+ if (ret < 0)
+ goto fail;
+
+ iov_iter_advance(&tmp_it, ret);
+ nbytes -= ret;
+ idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ }
+
+ BUG_ON(nbytes != 0);
+ *num_pages = npages;
+ *page_align = align;
+ dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
+ return pages;
+fail:
+ ceph_put_page_vector(pages, idx, false);
+ return ERR_PTR(ret);
+}
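The coalescing rule in dio_get_pagev_size() is: keep extending while the current segment ends on a page boundary and the next one starts on one. Isolated sketch of the predicate, plus a worked case with 4KiB pages: {base 0x1000, len 0x2000} ends at 0x3000 (aligned), so a next base of 0x8000 (aligned) combines with it, while a next base of 0x9100 would stop the scan.

#include <linux/mm.h>
#include <linux/uio.h>

static bool can_coalesce(const struct iovec *cur, const struct iovec *next)
{
	/* current tail and next base must both be page aligned */
	return PAGE_ALIGNED(cur->iov_base + cur->iov_len) &&
	       PAGE_ALIGNED(next->iov_base);
}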
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
size_t start;
ssize_t n;
- n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
- if (n < 0)
- return n;
-
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ n = dio_get_pagev_size(i);
+ pages = dio_get_pages_alloc(i, n, &start, &num_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
CEPH_OSD_FLAG_WRITE;
while (iov_iter_count(from) > 0) {
- u64 len = iov_iter_single_seg_count(from);
+ u64 len = dio_get_pagev_size(from);
size_t start;
ssize_t n;
@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
- n = iov_iter_get_pages_alloc(from, &pages, len, &start);
- if (unlikely(n < 0)) {
- ret = n;
+ n = len;
+ pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+ if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
break;
}
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 96d2bd829902..da55eb8bcffa 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;
@@ -1755,7 +1756,7 @@ retry:
*/
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.setxattr = ceph_setxattr,
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6706bde9ad1b..a2cb0c254060 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -228,12 +228,12 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
file, lock_cmd, wait, fl);
if (!err) {
- err = flock_lock_file_wait(file, fl);
+ err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
file, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on flock_lock_file_wait, undid lock", err);
+ dout("got %d on locks_lock_file_wait, undid lock", err);
}
}
return err;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 51cb02da75d9..e7b130a637f9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
- struct ceph_inode_info *ci = ceph_inode(dir);
-
ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
- list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
- if (req->r_unsafe_dir) {
+ if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode && req->r_got_unsafe) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
}
+ /* The inode has cached pages, but it's no longer used.
+	 * We can safely drop it. */
+ if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ !(oissued & CEPH_CAP_FILE_CACHE)) {
+ used = 0;
+ oissued = 0;
+ }
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
@@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
} else {
- /* try to drop referring dentries */
+ /* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
@@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
INIT_LIST_HEAD(&req->r_wait);
@@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
len = sizeof(*head) +
pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
- sizeof(struct timespec);
+ sizeof(struct ceph_timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -2477,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_unsafe_dir);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item,
+ &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
}
dout("handle_reply tid %lld result %d\n", tid, result);
@@ -2518,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
up_read(&mdsc->snap_rwsem);
if (realm)
ceph_put_snap_realm(mdsc, realm);
+
+ if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -3917,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_sign_message(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_sign_message(auth, msg);
}
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3940,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f575eafe2261..ccf11ef0ca87 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -236,6 +236,9 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
+ /* unsafe requests that modify the target inode */
+ struct list_head r_unsafe_target_item;
+
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f446afada328..ca4d5e8457f1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -639,8 +639,8 @@ static int __init init_caches(void)
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ceph_inode_init_once);
if (ceph_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2f2460d23a06..75b7d125ce66 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -342,6 +342,7 @@ struct ceph_inode_info {
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index f4cf200b3c76..6908080e9b6d 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -42,7 +42,7 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
goto error;
/* attach the data */
- key->payload.data = payload;
+ key->payload.data[0] = payload;
ret = 0;
error:
@@ -52,7 +52,7 @@ error:
static void
cifs_spnego_key_destroy(struct key *key)
{
- kfree(key->payload.data);
+ kfree(key->payload.data[0]);
}
@@ -167,7 +167,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
- struct cifs_spnego_msg *msg = spnego_key->payload.data;
+ struct cifs_spnego_msg *msg = spnego_key->payload.data[0];
cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
msg->secblob_len + msg->sesskey_len));
}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1ea780bc6376..3f93125916bf 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -58,16 +58,15 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
* dereference payload.data!
*/
if (prep->datalen <= sizeof(key->payload)) {
- key->payload.value = 0;
- memcpy(&key->payload.value, prep->data, prep->datalen);
- key->datalen = prep->datalen;
- return 0;
+ key->payload.data[0] = NULL;
+ memcpy(&key->payload, prep->data, prep->datalen);
+ } else {
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+ if (!payload)
+ return -ENOMEM;
+ key->payload.data[0] = payload;
}
- payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
- if (!payload)
- return -ENOMEM;
- key->payload.data = payload;
key->datalen = prep->datalen;
return 0;
}
@@ -76,7 +75,7 @@ static inline void
cifs_idmap_key_destroy(struct key *key)
{
if (key->datalen > sizeof(key->payload))
- kfree(key->payload.data);
+ kfree(key->payload.data[0]);
}
static struct key_type cifs_idmap_key_type = {
@@ -233,8 +232,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
* it could be.
*/
ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
- (struct cifs_sid *)&sidkey->payload.value :
- (struct cifs_sid *)sidkey->payload.data;
+ (struct cifs_sid *)&sidkey->payload :
+ (struct cifs_sid *)sidkey->payload.data[0];
ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
if (ksid_size > sidkey->datalen) {
@@ -307,14 +306,14 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
- memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+ memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t));
uid = make_kuid(&init_user_ns, id);
if (uid_valid(uid))
fuid = uid;
} else {
kgid_t gid;
gid_t id;
- memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+ memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t));
gid = make_kgid(&init_user_ns, id);
if (gid_valid(gid))
fgid = gid;
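The cifsacl change standardizes the old payload.value trick on the new union: payloads that fit in key->payload are stored inline, larger ones are kmemdup()'d into data[0], and key->datalen tells every later reader (and the destructor) which case applies. Condensed sketch of the pattern as used above:

#include <linux/key-type.h>
#include <linux/slab.h>
#include <linux/string.h>

static int toy_instantiate(struct key *key,
			   struct key_preparsed_payload *prep)
{
	if (prep->datalen <= sizeof(key->payload)) {
		memcpy(&key->payload, prep->data, prep->datalen);
	} else {
		void *p = kmemdup(prep->data, prep->datalen, GFP_KERNEL);

		if (!p)
			return -ENOMEM;
		key->payload.data[0] = p;
	}
	key->datalen = prep->datalen;
	return 0;
}

static void toy_destroy(struct key *key)
{
	if (key->datalen > sizeof(key->payload))
		kfree(key->payload.data[0]);	/* inline case: nothing to free */
}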
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e739950ca084..c4c1169814b2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -454,6 +454,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nocase");
if (tcon->retry)
seq_puts(s, ",hard");
+ if (tcon->use_persistent)
+ seq_puts(s, ",persistenthandles");
+ else if (tcon->use_resilient)
+ seq_puts(s, ",resilienthandles");
if (tcon->unix_ext)
seq_puts(s, ",unix");
else
@@ -896,8 +900,7 @@ const struct inode_operations cifs_file_inode_ops = {
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = cifs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = cifs_get_link,
.permission = cifs_permission,
/* BB add the following two eventually */
/* revalidate: cifs_revalidate,
@@ -910,6 +913,59 @@ const struct inode_operations cifs_symlink_inode_ops = {
#endif
};
+static int cifs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *target_inode = file_inode(dst_file);
+ struct cifsFileInfo *smb_file_src = src_file->private_data;
+ struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink);
+ unsigned int xid;
+ int rc;
+
+ cifs_dbg(FYI, "clone range\n");
+
+ xid = get_xid();
+
+ if (!src_file->private_data || !dst_file->private_data) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+ goto out;
+ }
+
+ /*
+	 * Note: the cifs case is easier than btrfs since the server is
+	 * responsible for checking proper open modes and file types, and
+	 * if it wants it could even support copying a range where source = target
+ */
+ lock_two_nondirectories(target_inode, src_inode);
+
+ if (len == 0)
+ len = src_inode->i_size - off;
+
+ cifs_dbg(FYI, "about to flush pages\n");
+	/* should we flush the first and last pages first? */
+ truncate_inode_pages_range(&target_inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len)-1);
+
+ if (target_tcon->ses->server->ops->duplicate_extents)
+ rc = target_tcon->ses->server->ops->duplicate_extents(xid,
+ smb_file_src, smb_file_target, off, len, destoff);
+ else
+ rc = -EOPNOTSUPP;
+
+ /* force revalidate of size and timestamps of target file now
+ that target is updated on the server */
+ CIFS_I(target_inode)->time = 0;
+ /* although unlocking in the reverse order from locking is not
+ strictly necessary here it is a little cleaner to be consistent */
+ unlock_two_nondirectories(src_inode, target_inode);
+out:
+ free_xid(xid);
+ return rc;
+}
+
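Wiring .clone_file_range into the cifs file_operations means the generic clone ioctl path now reaches this handler. A userspace sketch of driving it (paths are hypothetical; BTRFS_IOC_CLONE is the ioctl number that later kernels also expose as FICLONE):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	int src = open("/mnt/cifs/src", O_RDONLY);
	int dst = open("/mnt/cifs/dst", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}
	/* len == 0 in the handler above means "clone to EOF" */
	if (ioctl(dst, BTRFS_IOC_CLONE, src)) {
		perror("ioctl");
		return 1;
	}
	return 0;
}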
const struct file_operations cifs_file_ops = {
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
@@ -921,9 +977,8 @@ const struct file_operations cifs_file_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -939,9 +994,9 @@ const struct file_operations cifs_file_strict_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -957,9 +1012,8 @@ const struct file_operations cifs_file_direct_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -975,9 +1029,8 @@ const struct file_operations cifs_file_nobrl_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -992,9 +1045,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1009,9 +1061,8 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1022,6 +1073,7 @@ const struct file_operations cifs_dir_ops = {
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
+ .clone_file_range = cifs_clone_file_range,
.llseek = generic_file_llseek,
};
@@ -1040,7 +1092,7 @@ cifs_init_inodecache(void)
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c3cc1609025f..68c4547528c4 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,9 +120,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
#endif
/* Functions related to symlinks */
-extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
-extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
- int buflen);
+extern const char *cifs_get_link(struct dentry *, struct inode *,
+ struct delayed_call *);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
const char *symname);
extern int cifs_removexattr(struct dentry *, const char *);
@@ -131,7 +130,6 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *,
extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-
#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b406a32deb1f..2b510c537a0d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -493,7 +493,10 @@ struct smb_vol {
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
bool rwpidforward:1; /* pid forward for read/write operations */
- bool nosharesock;
+ bool nosharesock:1;
+ bool persistent:1;
+ bool nopersistent:1;
+	bool resilient:1; /* noresilient not required since not forced for CA */
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -895,6 +898,8 @@ struct cifs_tcon {
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
bool need_reconnect:1; /* connection reset, tid now invalid */
+ bool use_resilient:1; /* use resilient instead of durable handles */
+ bool use_persistent:1; /* use persistent instead of durable handles */
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
@@ -1015,6 +1020,7 @@ struct cifs_fid {
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+ __u8 create_guid[16];
#endif
struct cifs_pending_open *pending_open;
unsigned int epoch;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 773f4dc77630..ecb0803bdb0e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,6 +87,8 @@ enum {
Opt_sign, Opt_seal, Opt_noac,
Opt_fsc, Opt_mfsymlinks,
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+ Opt_persistent, Opt_nopersistent,
+ Opt_resilient, Opt_noresilient,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -169,6 +171,10 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_multiuser, "multiuser" },
{ Opt_sloppy, "sloppy" },
{ Opt_nosharesock, "nosharesock" },
+ { Opt_persistent, "persistenthandles"},
+ { Opt_nopersistent, "nopersistenthandles"},
+ { Opt_resilient, "resilienthandles"},
+ { Opt_noresilient, "noresilienthandles"},
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -1497,6 +1503,33 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_nosharesock:
vol->nosharesock = true;
break;
+ case Opt_nopersistent:
+ vol->nopersistent = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_persistent:
+ vol->persistent = true;
+ if ((vol->nopersistent) || (vol->resilient)) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_resilient:
+ vol->resilient = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_noresilient:
+ vol->resilient = false; /* already the default */
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -2325,13 +2358,14 @@ static int
cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
{
int rc = 0;
- char *desc, *delim, *payload;
+ const char *delim, *payload;
+ char *desc;
ssize_t len;
struct key *key;
struct TCP_Server_Info *server = ses->server;
struct sockaddr_in *sa;
struct sockaddr_in6 *sa6;
- struct user_key_payload *upayload;
+ const struct user_key_payload *upayload;
desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
if (!desc)
@@ -2374,14 +2408,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
}
down_read(&key->sem);
- upayload = key->payload.data;
+ upayload = user_key_payload(key);
if (IS_ERR_OR_NULL(upayload)) {
rc = upayload ? PTR_ERR(upayload) : -EINVAL;
goto out_key_put;
}
/* find first : in payload */
- payload = (char *)upayload->data;
+ payload = upayload->data;
delim = strnchr(payload, upayload->datalen, ':');
cifs_dbg(FYI, "payload=%s\n", payload);
if (!delim) {
@@ -2654,6 +2688,42 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
}
tcon->seal = volume_info->seal;
+ tcon->use_persistent = false;
+ /* check if SMB2 or later, CIFS does not support persistent handles */
+ if (volume_info->persistent) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB3 or later required for persistent handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#ifdef CONFIG_CIFS_SMB2
+ } else if (ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ tcon->use_persistent = true;
+ else /* persistent handles requested but not supported */ {
+ cifs_dbg(VFS,
+ "Persistent handles not supported on share\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
+ }
+#ifdef CONFIG_CIFS_SMB2
+ } else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ && (volume_info->nopersistent == false)) {
+ cifs_dbg(FYI, "enabling persistent handles\n");
+ tcon->use_persistent = true;
+#endif /* CONFIG_CIFS_SMB2 */
+ } else if (volume_info->resilient) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB2.1 or later required for resilient handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ }
+ tcon->use_resilient = true;
+ }
+
/*
* We can have only one retry value for a connection to a share so for
* resources mounted more than once to the same server share the last
@@ -3502,6 +3572,15 @@ try_mount_again:
goto mount_fail_check;
}
+#ifdef CONFIG_CIFS_SMB2
+ if ((volume_info->persistent == true) && ((ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
+ cifs_dbg(VFS, "persistent handles not supported by server\n");
+ rc = -EOPNOTSUPP;
+ goto mount_fail_check;
+ }
+#endif /* CONFIG_CIFS_SMB2*/
+
/* search for existing tcon to this server share */
tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 62203c387db4..0a2752b79e72 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1553,7 +1553,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
out:
if (flock->fl_flags & FL_POSIX && !rc)
- rc = posix_lock_file_wait(file, flock);
+ rc = locks_lock_file_wait(file, flock);
return rc;
}
@@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
struct page *page, *tpage;
unsigned int expected_index;
int rc;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
INIT_LIST_HEAD(tmplist);
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6b66dd5d1540..a329f5ba35aa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode)
* @word: long word containing the bit lock
*/
static int
-cifs_wait_bit_killable(struct wait_bit_key *key)
+cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 28a77bf1d559..7a3b84e300f8 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -34,68 +34,36 @@
#include "cifs_ioctl.h"
#include <linux/btrfs.h>
-static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
- unsigned long srcfd, u64 off, u64 len, u64 destoff,
- bool dup_extents)
+static int cifs_file_clone_range(unsigned int xid, struct file *src_file,
+ struct file *dst_file)
{
- int rc;
- struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
- struct cifs_tcon *target_tcon;
- struct fd src_file;
struct cifsFileInfo *smb_file_src;
- struct inode *src_inode;
+ struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
+ struct cifs_tcon *target_tcon;
+ int rc;
cifs_dbg(FYI, "ioctl clone range\n");
- /* the destination must be opened for writing */
- if (!(dst_file->f_mode & FMODE_WRITE)) {
- cifs_dbg(FYI, "file target not open for write\n");
- return -EINVAL;
- }
-
- /* check if target volume is readonly and take reference */
- rc = mnt_want_write_file(dst_file);
- if (rc) {
- cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
- return rc;
- }
- src_file = fdget(srcfd);
- if (!src_file.file) {
- rc = -EBADF;
- goto out_drop_write;
- }
-
- if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
- rc = -EBADF;
- cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
- goto out_fput;
- }
-
- if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+ if (!src_file->private_data || !dst_file->private_data) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
- goto out_fput;
+ goto out;
}
rc = -EXDEV;
smb_file_target = dst_file->private_data;
- smb_file_src = src_file.file->private_data;
+ smb_file_src = src_file->private_data;
src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
- /* check if source and target are on same tree connection */
- if (src_tcon != target_tcon) {
- cifs_dbg(VFS, "file copy src and target on different volume\n");
- goto out_fput;
+ if (src_tcon->ses != target_tcon->ses) {
+ cifs_dbg(VFS, "source and target of copy not on same server\n");
+ goto out;
}
- src_inode = file_inode(src_file.file);
- rc = -EINVAL;
- if (S_ISDIR(src_inode->i_mode))
- goto out_fput;
-
/*
	 * Note: the cifs case is easier than btrfs since the server is
	 * responsible for checking for proper open modes and file type and if it wants

@@ -103,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
*/
lock_two_nondirectories(target_inode, src_inode);
- /* determine range to clone */
- rc = -EINVAL;
- if (off + len > src_inode->i_size || off + len < off)
- goto out_unlock;
- if (len == 0)
- len = src_inode->i_size - off;
-
cifs_dbg(FYI, "about to flush pages\n");
	/* should we flush the first and last pages first? */
- truncate_inode_pages_range(&target_inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len)-1);
+ truncate_inode_pages(&target_inode->i_data, 0);
- if (dup_extents && target_tcon->ses->server->ops->duplicate_extents)
- rc = target_tcon->ses->server->ops->duplicate_extents(xid,
- smb_file_src, smb_file_target, off, len, destoff);
- else if (!dup_extents && target_tcon->ses->server->ops->clone_range)
+ if (target_tcon->ses->server->ops->clone_range)
rc = target_tcon->ses->server->ops->clone_range(xid,
- smb_file_src, smb_file_target, off, len, destoff);
+ smb_file_src, smb_file_target, 0, src_inode->i_size, 0);
else
rc = -EOPNOTSUPP;
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
CIFS_I(target_inode)->time = 0;
-out_unlock:
/* although unlocking in the reverse order from locking is not
	   strictly necessary here, it is a little cleaner to be consistent */
unlock_two_nondirectories(src_inode, target_inode);
+out:
+ return rc;
+}
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+ unsigned long srcfd)
+{
+ int rc;
+ struct fd src_file;
+ struct inode *src_inode;
+
+ cifs_dbg(FYI, "ioctl clone range\n");
+ /* the destination must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE)) {
+ cifs_dbg(FYI, "file target not open for write\n");
+ return -EINVAL;
+ }
+
+ /* check if target volume is readonly and take reference */
+ rc = mnt_want_write_file(dst_file);
+ if (rc) {
+ cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+ return rc;
+ }
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ rc = -EBADF;
+ goto out_drop_write;
+ }
+
+ if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+ goto out_fput;
+ }
+
+ src_inode = file_inode(src_file.file);
+ rc = -EINVAL;
+ if (S_ISDIR(src_inode->i_mode))
+ goto out_fput;
+
+ rc = cifs_file_clone_range(xid, src_file.file, dst_file);
+
out_fput:
fdput(src_file);
out_drop_write:
@@ -251,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
}
break;
case CIFS_IOC_COPYCHUNK_FILE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false);
- break;
- case BTRFS_IOC_CLONE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true);
+ rc = cifs_ioctl_clone(xid, filep, arg);
break;
case CIFS_IOC_SET_INTEGRITY:
if (pSMBFile == NULL)
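
With the BTRFS_IOC_CLONE special case removed here (reflink support is being consolidated behind the generic FICLONE path, which shares its ioctl number; note the FICLONE pass-through added to fs/compat_ioctl.c below), CIFS_IOC_COPYCHUNK_FILE remains the cifs-private whole-file server-side copy. A hypothetical userspace sketch; the ioctl encoding is mirrored from fs/cifs/cifs_ioctl.h and should be treated as an assumption:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define CIFS_IOCTL_MAGIC        0xCF
    #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)

    int main(void)
    {
            /* both files must live on the same cifs mount (same session) */
            int src = open("/mnt/cifs/src", O_RDONLY);
            int dst = open("/mnt/cifs/dst", O_WRONLY | O_CREAT, 0644);

            if (src < 0 || dst < 0)
                    return 1;
            /* the argument is the source fd; the server copies the data */
            if (ioctl(dst, CIFS_IOC_COPYCHUNK_FILE, src) < 0)
                    perror("CIFS_IOC_COPYCHUNK_FILE");
            close(src);
            close(dst);
            return 0;
    }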
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e3548f73bdea..062c2375549a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -627,9 +627,9 @@ cifs_hl_exit:
}
const char *
-cifs_follow_link(struct dentry *direntry, void **cookie)
+cifs_get_link(struct dentry *direntry, struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(direntry);
int rc = -ENOMEM;
unsigned int xid;
char *full_path = NULL;
@@ -639,6 +639,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
+ if (!direntry)
+ return ERR_PTR(-ECHILD);
+
xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
@@ -678,7 +681,8 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
kfree(target_path);
return ERR_PTR(rc);
}
- return *cookie = target_path;
+ set_delayed_call(done, kfree_link, target_path);
+ return target_path;
}
int
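
The ->follow_link()/->put_link() pair is replaced by ->get_link() plus a struct delayed_call that carries the destructor along with the returned link body. The supporting pieces, roughly as they appear in include/linux/delayed_call.h and fs/libfs.c of this series:

    struct delayed_call {
            void (*fn)(void *);
            void *arg;
    };

    /* record a destructor to run once the caller is done with the body */
    static inline void set_delayed_call(struct delayed_call *call,
                    void (*fn)(void *), void *arg)
    {
            call->fn = fn;
            call->arg = arg;
    }

    static inline void do_delayed_call(struct delayed_call *call)
    {
            if (call->fn)
                    call->fn(call->arg);
    }

    /* generic destructor, used above for the kmalloc'ed target_path */
    void kfree_link(void *p)
    {
            kfree(p);
    }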
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b1eede3678a9..0557c45e9c33 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -84,7 +84,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
dentry = d_hash_and_lookup(parent, name);
- if (unlikely(IS_ERR(dentry)))
+ if (IS_ERR(dentry))
return;
if (dentry) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index bce6fdcd5d48..59727e32ed0f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -988,7 +988,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
goto out;
}
- msg = spnego_key->payload.data;
+ msg = spnego_key->payload.data[0];
/*
* check version field to make sure that cifs.upcall is
* sending us a response in an expected form
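
The payload.data[0] change reflects the 4.4 key payload rework, where the type-specific data and the payload were merged into one fixed-size union; a sketch from include/linux/key.h of that series:

    union key_payload {
            void __rcu      *rcu_data0;     /* RCU-managed first slot */
            void            *data[4];       /* what was key->payload.data */
    };

    /* hence the old "spnego_key->payload.data" becomes "->payload.data[0]" */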
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2ab297dae5a7..f9e766f464be 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,6 +43,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock[17];
struct cifs_fid *fid = oparms->fid;
+ struct network_resiliency_req nr_ioctl_req;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL) {
@@ -67,6 +68,24 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
if (rc)
goto out;
+
+ if (oparms->tcon->use_resilient) {
+ nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+ nr_ioctl_req.Reserved = 0;
+ rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
+ fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true,
+ (char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
+ NULL, NULL /* no return info */);
+ if (rc == -EOPNOTSUPP) {
+ cifs_dbg(VFS,
+ "resiliency not supported by server, disabling\n");
+ oparms->tcon->use_resilient = false;
+ } else if (rc)
+ cifs_dbg(FYI, "error %d setting resiliency\n", rc);
+
+ rc = 0;
+ }
+
if (buf) {
/* open response does not have IndexNumber field - get it */
rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 18da19f4f811..53ccdde6ff18 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -810,7 +810,6 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, &eof, false);
}
-#ifdef CONFIG_CIFS_SMB311
static int
smb2_duplicate_extents(const unsigned int xid,
struct cifsFileInfo *srcfile,
@@ -854,8 +853,6 @@ smb2_duplicate_extents(const unsigned int xid,
duplicate_extents_out:
return rc;
}
-#endif /* CONFIG_CIFS_SMB311 */
-
static int
smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1703,6 +1700,7 @@ struct smb_version_operations smb30_operations = {
.create_lease_buf = smb3_create_lease_buf,
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .duplicate_extents = smb2_duplicate_extents,
.validate_negotiate = smb3_validate_negotiate,
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
@@ -1840,7 +1838,7 @@ struct smb_version_values smb21_values = {
struct smb_version_values smb30_values = {
.version_string = SMB30_VERSION_STRING,
.protocol_id = SMB30_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1860,7 +1858,7 @@ struct smb_version_values smb30_values = {
struct smb_version_values smb302_values = {
.version_string = SMB302_VERSION_STRING,
.protocol_id = SMB302_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1881,7 +1879,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 597a417ba94d..767555518d40 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -660,7 +660,7 @@ ssetup_ntlmssp_authenticate:
goto ssetup_exit;
}
- msg = spnego_key->payload.data;
+ msg = spnego_key->payload.data[0];
/*
* check version field to make sure that cifs.upcall is
* sending us a response in an expected form
@@ -1151,13 +1151,130 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
return 0;
}
+static struct create_durable_v2 *
+create_durable_v2_buf(struct cifs_fid *pfid)
+{
+ struct create_durable_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, dcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct durable_context_v2));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+	buf->dcontext.Timeout = 0; /* Should this be configurable by workload? */
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ get_random_bytes(buf->dcontext.CreateGuid, 16);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable_handle_reconnect_v2 *
+create_reconnect_durable_v2_buf(struct cifs_fid *fid)
+{
+ struct create_durable_handle_reconnect_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_handle_reconnect_v2),
+ GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ dcontext));
+ buf->ccontext.DataLength =
+ cpu_to_le32(sizeof(struct durable_reconnect_context_v2));
+ buf->ccontext.NameOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Fid.PersistentFileId = fid->persistent_fid;
+ buf->dcontext.Fid.VolatileFileId = fid->volatile_fid;
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ memcpy(buf->dcontext.CreateGuid, fid->create_guid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 is "DH2C" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
static int
-add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms)
{
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
+ iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+
+ iov[num].iov_base = create_reconnect_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ inc_rfc1001_len(&req->hdr,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms, bool use_persistent)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (use_persistent) {
+ if (oparms->reconnect)
+ return add_durable_reconnect_v2_context(iov, num_iovec,
+ oparms);
+ else
+ return add_durable_v2_context(iov, num_iovec, oparms);
+ }
+
if (oparms->reconnect) {
iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
/* indicate that we don't need to relock the file */
@@ -1275,7 +1392,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
ccontext->Next =
cpu_to_le32(server->vals->create_lease_size);
}
- rc = add_durable_context(iov, &num_iovecs, oparms);
+
+ rc = add_durable_context(iov, &num_iovecs, oparms,
+ tcon->use_persistent);
if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
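
The recurring `sizeof(struct smb2_create_req) - 4 + iov[1].iov_len` expression deserves a note; the comment below is a sketch of the framing assumption (the request struct is declared including the 4-byte RFC1001 length prefix, while SMB2 offsets are measured from the start of the SMB2 header proper):

    /*
     * CreateContextsOffset is relative to the SMB2 header, so:
     *
     *   offset = sizeof(struct smb2_create_req) - 4  // drop RFC1001 length prefix
     *            + iov[1].iov_len;                   // the path name sits between
     *                                                // the fixed part and contexts
     */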
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 451108284a2f..4af52780ec35 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -590,6 +590,44 @@ struct create_durable {
} Data;
} __packed;
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
+struct durable_context_v2 {
+ __le32 Timeout;
+ __le32 Flags;
+ __u64 Reserved;
+ __u8 CreateGuid[16];
+} __packed;
+
+struct create_durable_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_context_v2 dcontext;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.12 */
+struct durable_reconnect_context_v2 {
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ __u8 CreateGuid[16];
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.12 */
+struct durable_reconnect_context_v2_rsp {
+ __le32 Timeout;
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+struct create_durable_handle_reconnect_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_reconnect_context_v2 dcontext;
+} __packed;
+
#define COPY_CHUNK_RES_KEY_SIZE 24
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
@@ -643,6 +681,13 @@ struct fsctl_get_integrity_information_rsp {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+/* See MS-SMB2 2.2.31.3 */
+struct network_resiliency_req {
+ __le32 Timeout;
+ __le32 Reserved;
+} __packed;
+/* There is no buffer for the response, i.e. no struct network_resiliency_rsp */
+
struct validate_negotiate_info_req {
__le32 Capabilities;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a639d0dab453..f996daeea271 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
/* Retrieve an opaque file reference for server-side data movement ie copy */
#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
-#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
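
These FSCTL values follow the Windows CTL_CODE packing, which explains the shared 0x0014 (FILE_DEVICE_NETWORK_FILE_SYSTEM) prefix. A sketch of the decomposition; the macro is written out as an assumption rather than something this header defines:

    #define CTL_CODE(dev, func, method, access) \
            (((dev) << 16) | ((access) << 14) | ((func) << 2) | (method))

    /* 0x001401D4 == CTL_CODE(0x0014, 0x75, 0, 0):
     *   device   = FILE_DEVICE_NETWORK_FILE_SYSTEM (0x0014)
     *   function = 0x75, method = METHOD_BUFFERED, access = FILE_ANY_ACCESS */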
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index ff9e1f8b16a4..f5dc2f0df4ad 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#endif /* CONFIG_CIFS_ACL */
} else {
int temp;
- temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
+ temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
if (temp == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
@@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#else
cifs_dbg(FYI, "set POSIX ACL not supported\n");
#endif
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
@@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, ea_name, ea_value, buf_size,
cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
@@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
#else
cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
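
The XATTR_NAME_* constants come from include/uapi/linux/xattr.h and already carry the "system." prefix, so the strncmp() length arguments keep working unchanged; roughly:

    #define XATTR_SYSTEM_PREFIX             "system."

    #define XATTR_POSIX_ACL_ACCESS          "posix_acl_access"
    #define XATTR_NAME_POSIX_ACL_ACCESS \
            XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS   /* "system.posix_acl_access" */

    #define XATTR_POSIX_ACL_DEFAULT         "posix_acl_default"
    #define XATTR_NAME_POSIX_ACL_DEFAULT \
            XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT  /* "system.posix_acl_default" */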
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 7740b1c871c1..1bfb7ba4e85e 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,6 +8,7 @@
#include <linux/coda.h>
#include <linux/coda_psdev.h>
+#include <linux/pagemap.h>
#include "coda_linux.h"
static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
@@ -17,8 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
static const struct inode_operations coda_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = coda_setattr,
};
@@ -35,6 +35,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
inode->i_fop = &coda_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &coda_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &coda_symlink_aops;
inode->i_mapping = &inode->i_data;
} else
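
page_get_link() reads the symlink body straight out of the page cache with page_address(), which is only safe when the pages cannot come from highmem; inode_nohighmem() provides that guarantee. Its definition, as found in include/linux/fs.h around this release:

    /* GFP_USER lacks __GFP_HIGHMEM, so every page in this mapping is
     * directly addressable and page_address() needs no kmap() */
    static inline void inode_nohighmem(struct inode *inode)
    {
            mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
    }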
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390b87a3..57e81cbba0fa 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
- sizeof(struct coda_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct coda_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (coda_inode_cachep == NULL)
return -ENOMEM;
return 0;
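
SLAB_ACCOUNT is new in this cycle: objects allocated from a cache created with it are charged to the allocating task's memory cgroup, and the inode caches across the tree are converted in one sweep. A hypothetical cache showing the same pattern (all example_* names are illustrative):

    #include <linux/slab.h>

    struct example_obj {
            int payload;
    };

    static struct kmem_cache *example_cachep;

    static int __init example_cache_init(void)
    {
            example_cachep = kmem_cache_create("example_cache",
                                    sizeof(struct example_obj), 0,
                                    SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
                                    SLAB_ACCOUNT,  /* charge objects to memcg */
                                    NULL);
            return example_cachep ? 0 : -ENOMEM;
    }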
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index ab94ef63caef..03736e20d720 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page)
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
- char *p = kmap(page);
+ char *p = page_address(page);
cii = ITOC(inode);
@@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page)
if (error)
goto fail;
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return error;
}
diff --git a/fs/compat.c b/fs/compat.c
index 6fd272d455e4..a71936a3f4cb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -792,7 +792,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
const void __user *, data)
{
char *kernel_type;
- unsigned long data_page;
+ void *options;
char *kernel_dev;
int retval;
@@ -806,26 +806,25 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
if (IS_ERR(kernel_dev))
goto out1;
- retval = copy_mount_options(data, &data_page);
- if (retval < 0)
+ options = copy_mount_options(data);
+ retval = PTR_ERR(options);
+ if (IS_ERR(options))
goto out2;
- retval = -EINVAL;
-
- if (kernel_type && data_page) {
+ if (kernel_type && options) {
if (!strcmp(kernel_type, NCPFS_NAME)) {
- do_ncp_super_data_conv((void *)data_page);
+ do_ncp_super_data_conv(options);
} else if (!strcmp(kernel_type, NFS4_NAME)) {
- if (do_nfs4_super_data_conv((void *) data_page))
+ retval = -EINVAL;
+ if (do_nfs4_super_data_conv(options))
goto out3;
}
}
- retval = do_mount(kernel_dev, dir_name, kernel_type,
- flags, (void*)data_page);
+ retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
out3:
- free_page(data_page);
+ kfree(options);
out2:
kfree(kernel_dev);
out1:
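
copy_mount_options() changed from filling a caller-provided page to returning the copy itself, with failures carried as ERR_PTR() values. A simplified sketch of the new contract (the real fs/namespace.c version also tolerates short copies at the end of the user mapping):

    void *copy_mount_options(const void __user *data)
    {
            char *copy;

            if (!data)
                    return NULL;            /* no options is not an error */

            copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
            if (!copy)
                    return ERR_PTR(-ENOMEM);

            /* simplified: a partial copy near the end of the mapping is OK */
            if (copy_from_user(copy, data, PAGE_SIZE)) {
                    kfree(copy);
                    return ERR_PTR(-EFAULT);
            }
            return copy;                    /* caller frees with kfree() */
    }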
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 48851f6ea6ec..a5b8eb69a8f4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,6 +58,8 @@
#include <linux/atalk.h>
#include <linux/gfp.h>
+#include "internal.h"
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_sock.h>
#include <net/bluetooth/rfcomm.h>
@@ -115,19 +117,38 @@
#include <asm/fbio.h>
#endif
-static int w_long(unsigned int fd, unsigned int cmd,
- compat_ulong_t __user *argp)
+#define convert_in_user(srcptr, dstptr) \
+({ \
+ typeof(*srcptr) val; \
+ \
+ get_user(val, srcptr) || put_user(val, dstptr); \
+})
+
+static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- mm_segment_t old_fs = get_fs();
int err;
- unsigned long val;
- set_fs (KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long)&val);
- set_fs (old_fs);
- if (!err && put_user(val, argp))
+ err = security_file_ioctl(file, cmd, arg);
+ if (err)
+ return err;
+
+ return vfs_ioctl(file, cmd, arg);
+}
+
+static int w_long(struct file *file,
+ unsigned int cmd, compat_ulong_t __user *argp)
+{
+ int err;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
+
+ if (valp == NULL)
return -EFAULT;
- return err;
+ err = do_ioctl(file, cmd, (unsigned long)valp);
+ if (err)
+ return err;
+ if (convert_in_user(valp, argp))
+ return -EFAULT;
+ return 0;
}
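
The whole rewrite hinges on compat_alloc_user_space(): instead of pointing the native ioctl at a kernel buffer under set_fs(KERNEL_DS), a native-layout object is carved out of user-accessible memory and fields are moved with ordinary get_user()/put_user(). A simplified sketch of the x86 allocator (the real arch_compat_alloc_user_space() also handles the x32 redzone; treat the details as an approximation):

    /* reserve len bytes just below the current user stack pointer;
     * the result is an ordinary __user pointer */
    static inline void __user *arch_compat_alloc_user_space(long len)
    {
            unsigned long sp = task_pt_regs(current)->sp;

            return (void __user *)round_down(sp - len, 16);
    }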
struct compat_video_event {
@@ -139,23 +160,23 @@ struct compat_video_event {
} u;
};
-static int do_video_get_event(unsigned int fd, unsigned int cmd,
- struct compat_video_event __user *up)
+static int do_video_get_event(struct file *file,
+ unsigned int cmd, struct compat_video_event __user *up)
{
- struct video_event kevent;
- mm_segment_t old_fs = get_fs();
+ struct video_event __user *kevent =
+ compat_alloc_user_space(sizeof(*kevent));
int err;
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
- set_fs(old_fs);
+ if (kevent == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, cmd, (unsigned long)kevent);
if (!err) {
- err = put_user(kevent.type, &up->type);
- err |= put_user(kevent.timestamp, &up->timestamp);
- err |= put_user(kevent.u.size.w, &up->u.size.w);
- err |= put_user(kevent.u.size.h, &up->u.size.h);
- err |= put_user(kevent.u.size.aspect_ratio,
+ err = convert_in_user(&kevent->type, &up->type);
+ err |= convert_in_user(&kevent->timestamp, &up->timestamp);
+ err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
+ err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
+ err |= convert_in_user(&kevent->u.size.aspect_ratio,
&up->u.size.aspect_ratio);
if (err)
err = -EFAULT;
@@ -169,8 +190,8 @@ struct compat_video_still_picture {
int32_t size;
};
-static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
- struct compat_video_still_picture __user *up)
+static int do_video_stillpicture(struct file *file,
+ unsigned int cmd, struct compat_video_still_picture __user *up)
{
struct video_still_picture __user *up_native;
compat_uptr_t fp;
@@ -190,7 +211,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
@@ -200,8 +221,8 @@ struct compat_video_spu_palette {
compat_uptr_t palette;
};
-static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
- struct compat_video_spu_palette __user *up)
+static int do_video_set_spu_palette(struct file *file,
+ unsigned int cmd, struct compat_video_spu_palette __user *up)
{
struct video_spu_palette __user *up_native;
compat_uptr_t palp;
@@ -218,7 +239,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
@@ -276,7 +297,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
return 0;
}
-static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+static int sg_ioctl_trans(struct file *file, unsigned int cmd,
sg_io_hdr32_t __user *sgio32)
{
sg_io_hdr_t __user *sgio;
@@ -289,7 +310,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
if (get_user(interface_id, &sgio32->interface_id))
return -EFAULT;
if (interface_id != 'S')
- return sys_ioctl(fd, cmd, (unsigned long)sgio32);
+ return do_ioctl(file, cmd, (unsigned long)sgio32);
if (get_user(iovec_count, &sgio32->iovec_count))
return -EFAULT;
@@ -349,7 +370,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
if (put_user(compat_ptr(data), &sgio->usr_ptr))
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) sgio);
+ err = do_ioctl(file, cmd, (unsigned long) sgio);
if (err >= 0) {
void __user *datap;
@@ -380,13 +401,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
int unused;
};
-static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
- compat_sg_req_info __user *o)
+static int sg_grt_trans(struct file *file,
+ unsigned int cmd, struct compat_sg_req_info __user *o)
{
int err, i;
sg_req_info_t __user *r;
r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
- err = sys_ioctl(fd,cmd,(unsigned long)r);
+ err = do_ioctl(file, cmd, (unsigned long)r);
if (err < 0)
return err;
for (i = 0; i < SG_MAX_QUEUE; i++) {
@@ -412,8 +433,8 @@ struct sock_fprog32 {
#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
-static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
- struct sock_fprog32 __user *u_fprog32)
+static int ppp_sock_fprog_ioctl_trans(struct file *file,
+ unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
{
struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
void __user *fptr64;
@@ -435,7 +456,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
else
cmd = PPPIOCSACTIVE;
- return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
+ return do_ioctl(file, cmd, (unsigned long) u_fprog64);
}
struct ppp_option_data32 {
@@ -451,7 +472,7 @@ struct ppp_idle32 {
};
#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
-static int ppp_gidle(unsigned int fd, unsigned int cmd,
+static int ppp_gidle(struct file *file, unsigned int cmd,
struct ppp_idle32 __user *idle32)
{
struct ppp_idle __user *idle;
@@ -460,7 +481,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
idle = compat_alloc_user_space(sizeof(*idle));
- err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
+ err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
if (!err) {
if (get_user(xmit, &idle->xmit_idle) ||
@@ -472,7 +493,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
return err;
}
-static int ppp_scompress(unsigned int fd, unsigned int cmd,
+static int ppp_scompress(struct file *file, unsigned int cmd,
struct ppp_option_data32 __user *odata32)
{
struct ppp_option_data __user *odata;
@@ -492,7 +513,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd,
sizeof(__u32) + sizeof(int)))
return -EFAULT;
- return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
+ return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
}
#ifdef CONFIG_BLOCK
@@ -512,12 +533,13 @@ struct mtpos32 {
};
#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
-static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
+static int mt_ioctl_trans(struct file *file,
+ unsigned int cmd, void __user *argp)
{
- mm_segment_t old_fs = get_fs();
- struct mtget get;
+ /* NULL initialization to make gcc shut up */
+ struct mtget __user *get = NULL;
struct mtget32 __user *umget32;
- struct mtpos pos;
+ struct mtpos __user *pos = NULL;
struct mtpos32 __user *upos32;
unsigned long kcmd;
void *karg;
@@ -526,32 +548,34 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
switch(cmd) {
case MTIOCPOS32:
kcmd = MTIOCPOS;
- karg = &pos;
+ pos = compat_alloc_user_space(sizeof(*pos));
+ karg = pos;
break;
default: /* MTIOCGET32 */
kcmd = MTIOCGET;
- karg = &get;
+ get = compat_alloc_user_space(sizeof(*get));
+ karg = get;
break;
}
- set_fs (KERNEL_DS);
- err = sys_ioctl (fd, kcmd, (unsigned long)karg);
- set_fs (old_fs);
+ if (karg == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, kcmd, (unsigned long)karg);
if (err)
return err;
switch (cmd) {
case MTIOCPOS32:
upos32 = argp;
- err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
+ err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
break;
case MTIOCGET32:
umget32 = argp;
- err = __put_user(get.mt_type, &umget32->mt_type);
- err |= __put_user(get.mt_resid, &umget32->mt_resid);
- err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
- err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
- err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
- err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
- err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
+ err = convert_in_user(&get->mt_type, &umget32->mt_type);
+ err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
+ err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
+ err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
+ err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
+ err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
+ err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
break;
}
return err ? -EFAULT: 0;
@@ -605,42 +629,41 @@ struct serial_struct32 {
compat_int_t reserved[1];
};
-static int serial_struct_ioctl(unsigned fd, unsigned cmd,
- struct serial_struct32 __user *ss32)
+static int serial_struct_ioctl(struct file *file,
+ unsigned cmd, struct serial_struct32 __user *ss32)
{
typedef struct serial_struct32 SS32;
int err;
- struct serial_struct ss;
- mm_segment_t oldseg = get_fs();
+ struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
__u32 udata;
unsigned int base;
+ unsigned char *iomem_base;
+ if (ss == NULL)
+ return -EFAULT;
if (cmd == TIOCSSERIAL) {
- if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
- return -EFAULT;
- if (__get_user(udata, &ss32->iomem_base))
+ if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
+ get_user(udata, &ss32->iomem_base))
return -EFAULT;
- ss.iomem_base = compat_ptr(udata);
- if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __get_user(ss.port_high, &ss32->port_high))
+ iomem_base = compat_ptr(udata);
+ if (put_user(iomem_base, &ss->iomem_base) ||
+ convert_in_user(&ss32->iomem_reg_shift,
+ &ss->iomem_reg_shift) ||
+ convert_in_user(&ss32->port_high, &ss->port_high) ||
+ put_user(0UL, &ss->iomap_base))
return -EFAULT;
- ss.iomap_base = 0UL;
}
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd,cmd,(unsigned long)(&ss));
- set_fs(oldseg);
+ err = do_ioctl(file, cmd, (unsigned long)ss);
if (cmd == TIOCGSERIAL && err >= 0) {
- if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
+ if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
+ get_user(iomem_base, &ss->iomem_base))
return -EFAULT;
- base = (unsigned long)ss.iomem_base >> 32 ?
- 0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
- if (__put_user(base, &ss32->iomem_base) ||
- __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __put_user(ss.port_high, &ss32->port_high))
+ base = (unsigned long)iomem_base >> 32 ?
+ 0xffffffff : (unsigned)(unsigned long)iomem_base;
+ if (put_user(base, &ss32->iomem_base) ||
+ convert_in_user(&ss->iomem_reg_shift,
+ &ss32->iomem_reg_shift) ||
+ convert_in_user(&ss->port_high, &ss32->port_high))
return -EFAULT;
}
return err;
@@ -674,8 +697,8 @@ struct i2c_rdwr_aligned {
struct i2c_msg msgs[0];
};
-static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_rdwr_ioctl_data32 __user *udata)
+static int do_i2c_rdwr_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
{
struct i2c_rdwr_aligned __user *tdata;
struct i2c_msg __user *tmsgs;
@@ -686,7 +709,7 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
- if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS)
+ if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
return -EINVAL;
if (get_user(datap, &udata->msgs))
@@ -708,11 +731,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
put_user(compat_ptr(datap), &tmsgs[i].buf))
return -EFAULT;
}
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
-static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_smbus_ioctl_data32 __user *udata)
+static int do_i2c_smbus_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
{
struct i2c_smbus_ioctl_data __user *tdata;
compat_caddr_t datap;
@@ -734,7 +757,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
__put_user(compat_ptr(datap), &tdata->data))
return -EFAULT;
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
@@ -742,29 +765,27 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
-static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
+static int rtc_ioctl(struct file *file,
+ unsigned cmd, void __user *argp)
{
- mm_segment_t oldfs = get_fs();
- compat_ulong_t val32;
- unsigned long kval;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
int ret;
+ if (valp == NULL)
+ return -EFAULT;
switch (cmd) {
case RTC_IRQP_READ32:
case RTC_EPOCH_READ32:
- set_fs(KERNEL_DS);
- ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+ ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
RTC_IRQP_READ : RTC_EPOCH_READ,
- (unsigned long)&kval);
- set_fs(oldfs);
+ (unsigned long)valp);
if (ret)
return ret;
- val32 = kval;
- return put_user(val32, (unsigned int __user *)argp);
+ return convert_in_user(valp, (unsigned int __user *)argp);
case RTC_IRQP_SET32:
- return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
case RTC_EPOCH_SET32:
- return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
}
return -ENOIOCTLCMD;
@@ -1284,12 +1305,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* NBD */
-COMPATIBLE_IOCTL(NBD_DO_IT)
-COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
-COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
-COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
-COMPATIBLE_IOCTL(NBD_DISCONNECT)
/* i2c */
COMPATIBLE_IOCTL(I2C_SLAVE)
COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
@@ -1436,53 +1451,53 @@ IGNORE_IOCTL(FBIOGCURSOR32)
 * a compat_ioctl operation in the place that handles the
* ioctl for the native case.
*/
-static long do_ioctl_trans(int fd, unsigned int cmd,
+static long do_ioctl_trans(unsigned int cmd,
unsigned long arg, struct file *file)
{
void __user *argp = compat_ptr(arg);
switch (cmd) {
case PPPIOCGIDLE32:
- return ppp_gidle(fd, cmd, argp);
+ return ppp_gidle(file, cmd, argp);
case PPPIOCSCOMPRESS32:
- return ppp_scompress(fd, cmd, argp);
+ return ppp_scompress(file, cmd, argp);
case PPPIOCSPASS32:
case PPPIOCSACTIVE32:
- return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+ return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
#ifdef CONFIG_BLOCK
case SG_IO:
- return sg_ioctl_trans(fd, cmd, argp);
+ return sg_ioctl_trans(file, cmd, argp);
case SG_GET_REQUEST_TABLE:
- return sg_grt_trans(fd, cmd, argp);
+ return sg_grt_trans(file, cmd, argp);
case MTIOCGET32:
case MTIOCPOS32:
- return mt_ioctl_trans(fd, cmd, argp);
+ return mt_ioctl_trans(file, cmd, argp);
#endif
/* Serial */
case TIOCGSERIAL:
case TIOCSSERIAL:
- return serial_struct_ioctl(fd, cmd, argp);
+ return serial_struct_ioctl(file, cmd, argp);
/* i2c */
case I2C_FUNCS:
- return w_long(fd, cmd, argp);
+ return w_long(file, cmd, argp);
case I2C_RDWR:
- return do_i2c_rdwr_ioctl(fd, cmd, argp);
+ return do_i2c_rdwr_ioctl(file, cmd, argp);
case I2C_SMBUS:
- return do_i2c_smbus_ioctl(fd, cmd, argp);
+ return do_i2c_smbus_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
- return rtc_ioctl(fd, cmd, argp);
+ return rtc_ioctl(file, cmd, argp);
/* dvb */
case VIDEO_GET_EVENT:
- return do_video_get_event(fd, cmd, argp);
+ return do_video_get_event(file, cmd, argp);
case VIDEO_STILLPICTURE:
- return do_video_stillpicture(fd, cmd, argp);
+ return do_video_stillpicture(file, cmd, argp);
case VIDEO_SET_SPU_PALETTE:
- return do_video_set_spu_palette(fd, cmd, argp);
+ return do_video_set_spu_palette(file, cmd, argp);
}
/*
@@ -1508,12 +1523,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
case KDSKBMETA:
case KDSKBLED:
case KDSETLED:
- /* NBD */
- case NBD_SET_SOCK:
- case NBD_SET_BLKSIZE:
- case NBD_SET_SIZE:
- case NBD_SET_SIZE_BLOCKS:
- return do_vfs_ioctl(file, fd, cmd, arg);
+ return vfs_ioctl(file, cmd, arg);
}
return -ENOIOCTLCMD;
@@ -1580,6 +1590,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
goto out_fput;
#endif
+ case FICLONE:
+ case FICLONERANGE:
+ case FIDEDUPERANGE:
+ goto do_ioctl;
+
case FIBMAP:
case FIGETBSZ:
case FIONREAD:
@@ -1602,7 +1617,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (compat_ioctl_check_table(XFORM(cmd)))
goto found_handler;
- error = do_ioctl_trans(fd, cmd, arg, f.file);
+ error = do_ioctl_trans(cmd, arg, f.file);
if (error == -ENOIOCTLCMD)
error = -ENOTTY;
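
FICLONE, FICLONERANGE and FIDEDUPERANGE can be forwarded untranslated because their argument layouts use only fixed-width fields and are identical on 32- and 64-bit ABIs. For reference, the uapi encodings as defined in include/uapi/linux/fs.h of this series:

    #define FICLONE         _IOW(0x94, 9, int)
    #define FICLONERANGE    _IOW(0x94, 13, struct file_clone_range)
    #define FIDEDUPERANGE   _IOWR(0x94, 54, struct file_dedupe_range)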
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index b65d1ef532d5..ccc31fa6f1a7 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -53,13 +53,14 @@ struct configfs_dirent {
#define CONFIGFS_ROOT 0x0001
#define CONFIGFS_DIR 0x0002
#define CONFIGFS_ITEM_ATTR 0x0004
+#define CONFIGFS_ITEM_BIN_ATTR 0x0008
#define CONFIGFS_ITEM_LINK 0x0020
#define CONFIGFS_USET_DIR 0x0040
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
#define CONFIGFS_USET_IN_MKDIR 0x0200
#define CONFIGFS_USET_CREATING 0x0400
-#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
+#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
@@ -72,6 +73,8 @@ extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *,
extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_create_bin_file(struct config_item *,
+ const struct configfs_bin_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
struct dentry *, void *, umode_t, int);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
@@ -88,7 +91,7 @@ extern void configfs_release_fs(void);
extern struct rw_semaphore configfs_rename_sem;
extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
-extern const struct file_operations bin_fops;
+extern const struct file_operations configfs_bin_file_operations;
extern const struct inode_operations configfs_dir_inode_operations;
extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
@@ -119,6 +122,13 @@ static inline struct configfs_attribute * to_attr(struct dentry * dentry)
return ((struct configfs_attribute *) sd->s_element);
}
+static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry)
+{
+ struct configfs_attribute *attr = to_attr(dentry);
+
+ return container_of(attr, struct configfs_bin_attribute, cb_attr);
+}
+
static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
{
struct config_item * item = NULL;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index c81ce7f200a6..cab612b2ae76 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -255,6 +255,12 @@ static void configfs_init_file(struct inode * inode)
inode->i_fop = &configfs_file_operations;
}
+static void configfs_init_bin_file(struct inode *inode)
+{
+ inode->i_size = 0;
+ inode->i_fop = &configfs_bin_file_operations;
+}
+
static void init_symlink(struct inode * inode)
{
inode->i_op = &configfs_symlink_inode_operations;
@@ -423,7 +429,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
spin_unlock(&configfs_dirent_lock);
error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
- configfs_init_file);
+ (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
+ configfs_init_bin_file :
+ configfs_init_file);
if (error) {
configfs_put(sd);
return error;
@@ -583,6 +591,7 @@ static int populate_attrs(struct config_item *item)
{
struct config_item_type *t = item->ci_type;
struct configfs_attribute *attr;
+ struct configfs_bin_attribute *bin_attr;
int error = 0;
int i;
@@ -594,6 +603,13 @@ static int populate_attrs(struct config_item *item)
break;
}
}
+ if (t->ct_bin_attrs) {
+ for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
+ error = configfs_create_bin_file(item, bin_attr);
+ if (error)
+ break;
+ }
+ }
if (error)
detach_attrs(item);
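
For a provider's view of the new binary attributes, here is a hypothetical item type wiring one up; all frob_* names are illustrative, while CONFIGFS_BIN_ATTR() and ct_bin_attrs are taken from include/linux/configfs.h of this series:

    static char frob_blob[65536];
    static size_t frob_blob_len;

    /* the first read call comes with buf == NULL asking for the size */
    static ssize_t frob_blob_read(struct config_item *item, void *buf,
                                  size_t size)
    {
            if (buf)
                    memcpy(buf, frob_blob, frob_blob_len);
            return frob_blob_len;
    }

    /* called once at close() with the fully assembled write buffer */
    static ssize_t frob_blob_write(struct config_item *item, const void *buf,
                                   size_t size)
    {
            if (size > sizeof(frob_blob))
                    return -EFBIG;
            memcpy(frob_blob, buf, size);
            frob_blob_len = size;
            return size;
    }

    CONFIGFS_BIN_ATTR(frob_, blob, NULL, 65536);

    static struct configfs_bin_attribute *frob_bin_attrs[] = {
            &frob_attr_blob,
            NULL,
    };

    static struct config_item_type frob_type = {
            .ct_owner       = THIS_MODULE,
            .ct_bin_attrs   = frob_bin_attrs,
    };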
@@ -1054,11 +1070,55 @@ out:
return ret;
}
+static int configfs_do_depend_item(struct dentry *subsys_dentry,
+ struct config_item *target)
+{
+ struct configfs_dirent *p;
+ int ret;
+
+ spin_lock(&configfs_dirent_lock);
+ /* Scan the tree, return 0 if found */
+ ret = configfs_depend_prep(subsys_dentry, target);
+ if (ret)
+ goto out_unlock_dirent_lock;
+
+ /*
+ * We are sure that the item is not about to be removed by rmdir(), and
+ * not in the middle of attachment by mkdir().
+ */
+ p = target->ci_dentry->d_fsdata;
+ p->s_dependent_count += 1;
+
+out_unlock_dirent_lock:
+ spin_unlock(&configfs_dirent_lock);
+
+ return ret;
+}
+
+static inline struct configfs_dirent *
+configfs_find_subsys_dentry(struct configfs_dirent *root_sd,
+ struct config_item *subsys_item)
+{
+ struct configfs_dirent *p;
+ struct configfs_dirent *ret = NULL;
+
+ list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+ if (p->s_type & CONFIGFS_DIR &&
+ p->s_element == subsys_item) {
+ ret = p;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+
int configfs_depend_item(struct configfs_subsystem *subsys,
struct config_item *target)
{
int ret;
- struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+ struct configfs_dirent *subsys_sd;
struct config_item *s_item = &subsys->su_group.cg_item;
struct dentry *root;
@@ -1077,39 +1137,15 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
*/
mutex_lock(&d_inode(root)->i_mutex);
- root_sd = root->d_fsdata;
-
- list_for_each_entry(p, &root_sd->s_children, s_sibling) {
- if (p->s_type & CONFIGFS_DIR) {
- if (p->s_element == s_item) {
- subsys_sd = p;
- break;
- }
- }
- }
-
+ subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item);
if (!subsys_sd) {
ret = -ENOENT;
goto out_unlock_fs;
}
/* Ok, now we can trust subsys/s_item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
- spin_lock(&configfs_dirent_lock);
- /* Scan the tree, return 0 if found */
- ret = configfs_depend_prep(subsys_sd->s_dentry, target);
- if (ret)
- goto out_unlock_dirent_lock;
-
- /*
- * We are sure that the item is not about to be removed by rmdir(), and
- * not in the middle of attachment by mkdir().
- */
- p = target->ci_dentry->d_fsdata;
- p->s_dependent_count += 1;
-
-out_unlock_dirent_lock:
- spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
mutex_unlock(&d_inode(root)->i_mutex);
@@ -1128,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item);
 * configfs_depend_item() because we know that the client driver is
* pinned, thus the subsystem is pinned, and therefore configfs is pinned.
*/
-void configfs_undepend_item(struct configfs_subsystem *subsys,
- struct config_item *target)
+void configfs_undepend_item(struct config_item *target)
{
struct configfs_dirent *sd;
@@ -1152,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
}
EXPORT_SYMBOL(configfs_undepend_item);
+/*
+ * caller_subsys is the caller's subsystem, not the target's. It is used to
+ * determine whether we need to lock the root and check the subsystem. When we
+ * are in the same subsystem as our target there is no need for locking: we
+ * know the subsystem is valid and cannot be unregistered during this function,
+ * since we are called from a callback of one of its children and the VFS
+ * holds a lock on some inode. Otherwise we have to lock our root to ensure
+ * that the target's subsystem is not unregistered during this function.
+ */
+int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys,
+ struct config_item *target)
+{
+ struct configfs_subsystem *target_subsys;
+ struct config_group *root, *parent;
+ struct configfs_dirent *subsys_sd;
+ int ret = -ENOENT;
+
+ /* Disallow this function for configfs root */
+ if (configfs_is_root(target))
+ return -EINVAL;
+
+ parent = target->ci_group;
+ /*
+	 * This may happen when someone is trying to depend on the root
+	 * directory of some subsystem
+ */
+ if (configfs_is_root(&parent->cg_item)) {
+ target_subsys = to_configfs_subsystem(to_config_group(target));
+ root = parent;
+ } else {
+ target_subsys = parent->cg_subsys;
+		/* Find the configfs root as we may need it for locking */
+ for (root = parent; !configfs_is_root(&root->cg_item);
+ root = root->cg_item.ci_group)
+ ;
+ }
+
+ if (target_subsys != caller_subsys) {
+ /*
+		 * We are in another configfs subsystem, so we have to do
+		 * additional locking to prevent the other subsystem from
+		 * being unregistered
+ */
+ mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex);
+
+ /*
+		 * As we are trying to depend on an item from another subsystem
+		 * we have to check whether that subsystem is still registered
+ */
+ subsys_sd = configfs_find_subsys_dentry(
+ root->cg_item.ci_dentry->d_fsdata,
+ &target_subsys->su_group.cg_item);
+ if (!subsys_sd)
+ goto out_root_unlock;
+ } else {
+ subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata;
+ }
+
+	/* Now we can execute the core of the depend-item operation */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
+
+ if (target_subsys != caller_subsys)
+out_root_unlock:
+ /*
+	 * We were called from a subsystem other than our target's, so we
+	 * took extra locks; it is time to release them now
+ */
+ mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item_unlocked);
+
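
A hypothetical consumer of the unlocked variant (names are illustrative): pin an item living in a foreign subsystem so that rmdir(2) on it fails with -EBUSY for as long as the dependency is held:

    static int example_grab(struct configfs_subsystem *my_subsys,
                            struct config_item *target)
    {
            int ret;

            ret = configfs_depend_item_unlocked(my_subsys, target);
            if (ret)
                    return ret;

            /* ... use whatever 'target' represents; it cannot be rmdir'ed ... */

            configfs_undepend_item(target);
            return 0;
    }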
static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret = 0;
@@ -1636,6 +1744,116 @@ const struct file_operations configfs_dir_operations = {
.iterate = configfs_readdir,
};
+/**
+ * configfs_register_group - creates a parent-child relation between two groups
+ * @parent_group: parent group
+ * @group: child group
+ *
+ * Links the groups, creates a dentry for the child and attaches it to
+ * the parent dentry.
+ *
+ * Return: 0 on success, negative errno code on error
+ */
+int configfs_register_group(struct config_group *parent_group,
+ struct config_group *group)
+{
+ struct configfs_subsystem *subsys = parent_group->cg_subsys;
+ struct dentry *parent;
+ int ret;
+
+ mutex_lock(&subsys->su_mutex);
+ link_group(parent_group, group);
+ mutex_unlock(&subsys->su_mutex);
+
+ parent = parent_group->cg_item.ci_dentry;
+
+ mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ ret = create_default_group(parent_group, group);
+ if (!ret) {
+ spin_lock(&configfs_dirent_lock);
+ configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
+ spin_unlock(&configfs_dirent_lock);
+ }
+ mutex_unlock(&d_inode(parent)->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(configfs_register_group);
+
+/**
+ * configfs_unregister_group() - unregisters a child group from its parent
+ * @group: child group to be unregistered
+ *
+ * Undoes configfs_register_group()
+ */
+void configfs_unregister_group(struct config_group *group)
+{
+ struct configfs_subsystem *subsys = group->cg_subsys;
+ struct dentry *dentry = group->cg_item.ci_dentry;
+ struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+
+ mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ spin_lock(&configfs_dirent_lock);
+ configfs_detach_prep(dentry, NULL);
+ spin_unlock(&configfs_dirent_lock);
+
+ configfs_detach_group(&group->cg_item);
+ d_inode(dentry)->i_flags |= S_DEAD;
+ dont_mount(dentry);
+ d_delete(dentry);
+ mutex_unlock(&d_inode(parent)->i_mutex);
+
+ dput(dentry);
+
+ mutex_lock(&subsys->su_mutex);
+ unlink_group(group);
+ mutex_unlock(&subsys->su_mutex);
+}
+EXPORT_SYMBOL(configfs_unregister_group);
+
+/**
+ * configfs_register_default_group() - allocates and registers a child group
+ * @parent_group: parent group
+ * @name: child group name
+ * @item_type: child item type description
+ *
+ * Boilerplate to allocate and register a child group with its parent. We need
+ * kzalloc'ed memory because the child's default_group list starts out empty.
+ *
+ * Return: allocated config group or ERR_PTR() on error
+ */
+struct config_group *
+configfs_register_default_group(struct config_group *parent_group,
+ const char *name,
+ struct config_item_type *item_type)
+{
+ int ret;
+ struct config_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+ config_group_init_type_name(group, name, item_type);
+
+ ret = configfs_register_group(parent_group, group);
+ if (ret) {
+ kfree(group);
+ return ERR_PTR(ret);
+ }
+ return group;
+}
+EXPORT_SYMBOL(configfs_register_default_group);
+
+/**
+ * configfs_unregister_default_group() - unregisters and frees a child group
+ * @group: the group to act on
+ */
+void configfs_unregister_default_group(struct config_group *group)
+{
+ configfs_unregister_group(group);
+ kfree(group);
+}
+EXPORT_SYMBOL(configfs_unregister_default_group);
+
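
Typical use of the new default-group helpers in a driver, as a hypothetical sketch (names are illustrative):

    static struct config_group *features_grp;

    static int example_attach(struct configfs_subsystem *subsys,
                              struct config_item_type *features_type)
    {
            features_grp = configfs_register_default_group(&subsys->su_group,
                                                           "features",
                                                           features_type);
            return PTR_ERR_OR_ZERO(features_grp);
    }

    static void example_detach(void)
    {
            configfs_unregister_default_group(features_grp);
    }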
int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 403269ffcdf3..3687187c8ea5 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -28,6 +28,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <linux/configfs.h>
@@ -48,6 +49,10 @@ struct configfs_buffer {
struct configfs_item_operations * ops;
struct mutex mutex;
int needs_read_fill;
+ bool read_in_progress;
+ bool write_in_progress;
+ char *bin_buffer;
+ int bin_buffer_size;
};
@@ -65,7 +70,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
{
struct configfs_attribute * attr = to_attr(dentry);
struct config_item * item = to_item(dentry->d_parent);
- struct configfs_item_operations * ops = buffer->ops;
int ret = 0;
ssize_t count;
@@ -74,7 +78,8 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
if (!buffer->page)
return -ENOMEM;
- count = ops->show_attribute(item,attr,buffer->page);
+ count = attr->show(item, buffer->page);
+
buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
if (count >= 0)
@@ -123,6 +128,87 @@ out:
return retval;
}
+/**
+ * configfs_read_bin_file - read a binary attribute.
+ * @file: file pointer.
+ * @buf: buffer to fill.
+ * @count: number of bytes to read.
+ * @ppos: starting offset in file.
+ *
+ * Userspace wants to read a binary attribute file. The attribute
+ * descriptor is in the file's ->d_fsdata. The target item is in the
+ * directory's ->d_fsdata.
+ *
+ * We check whether we need to refill the buffer. If so we will
+ * call the attribute's ->read() method twice. The first time we
+ * will pass NULL as the buffer pointer, which the method uses as a
+ * cue to return the size of the buffer required. If no error occurs
+ * we will allocate the buffer using vmalloc and call ->read() again,
+ * passing that buffer as an argument.
+ * Then we just copy to user space using simple_read_from_buffer.
+ */
+
+static ssize_t
+configfs_read_bin_file(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct configfs_buffer *buffer = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct config_item *item = to_item(dentry->d_parent);
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ ssize_t retval = 0;
+ ssize_t len = min_t(size_t, count, PAGE_SIZE);
+
+ mutex_lock(&buffer->mutex);
+
+ /* we don't support switching read/write modes */
+ if (buffer->write_in_progress) {
+ retval = -ETXTBSY;
+ goto out;
+ }
+ buffer->read_in_progress = 1;
+
+ if (buffer->needs_read_fill) {
+ /* perform first read with buf == NULL to get extent */
+ len = bin_attr->read(item, NULL, 0);
+ if (len <= 0) {
+ retval = len;
+ goto out;
+ }
+
+ /* do not exceed the maximum value */
+ if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+ retval = -EFBIG;
+ goto out;
+ }
+
+ buffer->bin_buffer = vmalloc(len);
+ if (buffer->bin_buffer == NULL) {
+ retval = -ENOMEM;
+ goto out;
+ }
+ buffer->bin_buffer_size = len;
+
+ /* perform second read to fill buffer */
+ len = bin_attr->read(item, buffer->bin_buffer, len);
+ if (len < 0) {
+ retval = len;
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer_size = 0;
+ buffer->bin_buffer = NULL;
+ goto out;
+ }
+
+ buffer->needs_read_fill = 0;
+ }
+
+ retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+out:
+ mutex_unlock(&buffer->mutex);
+ return retval;
+}
+
/**
* fill_write_buffer - copy buffer from userspace.
@@ -171,9 +257,8 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
{
struct configfs_attribute * attr = to_attr(dentry);
struct config_item * item = to_item(dentry->d_parent);
- struct configfs_item_operations * ops = buffer->ops;
- return ops->store_attribute(item,attr,buffer->page,count);
+ return attr->store(item, buffer->page, count);
}
@@ -210,10 +295,80 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
return len;
}
-static int check_perm(struct inode * inode, struct file * file)
+/**
+ * configfs_write_bin_file - write a binary attribute.
+ * @file: file pointer
+ * @buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Writing to a binary attribute file is buffered like the binary read above.
+ * We buffer the consecutive writes (binary attribute files do not
+ * support lseek) in a continuously growing buffer, but we don't
+ * commit until the close of the file.
+ */
+
+static ssize_t
+configfs_write_bin_file(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct configfs_buffer *buffer = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ void *tbuf = NULL;
+ ssize_t len;
+
+ mutex_lock(&buffer->mutex);
+
+ /* we don't support switching read/write modes */
+ if (buffer->read_in_progress) {
+ len = -ETXTBSY;
+ goto out;
+ }
+ buffer->write_in_progress = 1;
+
+ /* buffer grows? */
+ if (*ppos + count > buffer->bin_buffer_size) {
+
+		if (bin_attr->cb_max_size &&
+			*ppos + count > bin_attr->cb_max_size) {
+			len = -EFBIG;
+			goto out;
+		}
+
+ tbuf = vmalloc(*ppos + count);
+ if (tbuf == NULL) {
+ len = -ENOMEM;
+ goto out;
+ }
+
+ /* copy old contents */
+ if (buffer->bin_buffer) {
+ memcpy(tbuf, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+ vfree(buffer->bin_buffer);
+ }
+
+ /* clear the new area */
+ memset(tbuf + buffer->bin_buffer_size, 0,
+ *ppos + count - buffer->bin_buffer_size);
+ buffer->bin_buffer = tbuf;
+ buffer->bin_buffer_size = *ppos + count;
+ }
+
+	/* simple_write_to_buffer() advances *ppos on success */
+	len = simple_write_to_buffer(buffer->bin_buffer,
+			buffer->bin_buffer_size, ppos, buf, count);
+out:
+ mutex_unlock(&buffer->mutex);
+ return len;
+}
+
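
The matching ->write() handler runs only once, at close time, when the release path further below hands it the complete accumulated buffer. A sketch under the same hypothetical fw_item assumptions as above:

	static ssize_t fw_item_image_write(struct config_item *item,
					   const void *buf, size_t size)
	{
		struct fw_item *fw = to_fw_item(item);
		void *copy;

		/* called exactly once, from release, with the whole blob */
		copy = kmemdup(buf, size, GFP_KERNEL);
		if (!copy)
			return -ENOMEM;

		kfree(fw->blob);
		fw->blob = copy;
		fw->blob_len = size;
		return size;
	}
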
+static int check_perm(struct inode * inode, struct file * file, int type)
{
struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
struct configfs_attribute * attr = to_attr(file->f_path.dentry);
+ struct configfs_bin_attribute *bin_attr = NULL;
struct configfs_buffer * buffer;
struct configfs_item_operations * ops = NULL;
int error = 0;
@@ -221,6 +376,9 @@ static int check_perm(struct inode * inode, struct file * file)
if (!item || !attr)
goto Einval;
+ if (type & CONFIGFS_ITEM_BIN_ATTR)
+ bin_attr = to_bin_attr(file->f_path.dentry);
+
/* Grab the module reference for this attribute if we have one */
if (!try_module_get(attr->ca_owner)) {
error = -ENODEV;
@@ -237,10 +395,14 @@ static int check_perm(struct inode * inode, struct file * file)
* and we must have a store method.
*/
if (file->f_mode & FMODE_WRITE) {
+ if (!(inode->i_mode & S_IWUGO))
+ goto Eaccess;
- if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+ if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
goto Eaccess;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
+ goto Eaccess;
}
/* File needs read support.
@@ -248,7 +410,13 @@ static int check_perm(struct inode * inode, struct file * file)
* must be a show method for it.
*/
if (file->f_mode & FMODE_READ) {
- if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+ if (!(inode->i_mode & S_IRUGO))
+ goto Eaccess;
+
+ if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
+ goto Eaccess;
+
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
goto Eaccess;
}
@@ -262,6 +430,8 @@ static int check_perm(struct inode * inode, struct file * file)
}
mutex_init(&buffer->mutex);
buffer->needs_read_fill = 1;
+ buffer->read_in_progress = 0;
+ buffer->write_in_progress = 0;
buffer->ops = ops;
file->private_data = buffer;
goto Done;
@@ -279,12 +449,7 @@ static int check_perm(struct inode * inode, struct file * file)
return error;
}
-static int configfs_open_file(struct inode * inode, struct file * filp)
-{
- return check_perm(inode,filp);
-}
-
-static int configfs_release(struct inode * inode, struct file * filp)
+static int configfs_release(struct inode *inode, struct file *filp)
{
struct config_item * item = to_item(filp->f_path.dentry->d_parent);
struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
@@ -305,6 +470,47 @@ static int configfs_release(struct inode * inode, struct file * filp)
return 0;
}
+static int configfs_open_file(struct inode *inode, struct file *filp)
+{
+ return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+}
+
+static int configfs_open_bin_file(struct inode *inode, struct file *filp)
+{
+ return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+}
+
+static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+{
+ struct configfs_buffer *buffer = filp->private_data;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct config_item *item = to_item(dentry->d_parent);
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ ssize_t len = 0;
+ int ret;
+
+ buffer->read_in_progress = 0;
+
+ if (buffer->write_in_progress) {
+ buffer->write_in_progress = 0;
+
+ len = bin_attr->write(item, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+
+ /* vfree on NULL is safe */
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer = NULL;
+ buffer->bin_buffer_size = 0;
+ buffer->needs_read_fill = 1;
+ }
+
+ ret = configfs_release(inode, filp);
+ if (len < 0)
+ return len;
+ return ret;
+}
+
const struct file_operations configfs_file_operations = {
.read = configfs_read_file,
.write = configfs_write_file,
@@ -313,6 +519,14 @@ const struct file_operations configfs_file_operations = {
.release = configfs_release,
};
+const struct file_operations configfs_bin_file_operations = {
+ .read = configfs_read_bin_file,
+ .write = configfs_write_bin_file,
+ .llseek = NULL, /* bin file is not seekable */
+ .open = configfs_open_bin_file,
+ .release = configfs_release_bin_file,
+};
+
/**
* configfs_create_file - create an attribute file for an item.
* @item: item we're creating for.
@@ -334,3 +548,24 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
return error;
}
+/**
+ * configfs_create_bin_file - create a binary attribute file for an item.
+ * @item: item we're creating for.
+ * @bin_attr: attribute descriptor.
+ */
+
+int configfs_create_bin_file(struct config_item *item,
+ const struct configfs_bin_attribute *bin_attr)
+{
+ struct dentry *dir = item->ci_dentry;
+ struct configfs_dirent *parent_sd = dir->d_fsdata;
+ umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG;
+ int error = 0;
+
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+ error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
+ CONFIGFS_ITEM_BIN_ATTR);
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ return error;
+}
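
Tying the pieces together, a client would describe the attribute with a configfs_bin_attribute, using the fields consumed above (cb_attr for name/mode/owner, cb_max_size as the -EFBIG cap, and the two handlers), and register it once the item exists. Names and the size cap below are illustrative:

	static struct configfs_bin_attribute fw_item_attr_image = {
		.cb_attr = {
			.ca_name	= "image",
			.ca_mode	= S_IRUGO | S_IWUSR,
			.ca_owner	= THIS_MODULE,
		},
		.cb_max_size	= SZ_1M,	/* larger I/O fails with -EFBIG */
		.read		= fw_item_image_read,
		.write		= fw_item_image_write,
	};

	/* ... and, once the item's dentry exists: */
	error = configfs_create_bin_file(item, &fw_item_attr_image);
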
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eae87575e681..0cc810e9dccc 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -218,7 +218,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
return sd->s_dentry->d_name.name;
- if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+ if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) {
attr = sd->s_element;
return attr->ca_name;
}
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index ec5c8325b503..db6d69289608 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,27 +279,33 @@ static int configfs_getlink(struct dentry *dentry, char * path)
}
-static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *configfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unsigned long page = get_zeroed_page(GFP_KERNEL);
+ char *body;
int error;
- if (!page)
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = configfs_getlink(dentry, (char *)page);
+ error = configfs_getlink(dentry, body);
if (!error) {
- return *cookie = (void *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
- free_page(page);
+ kfree(body);
return ERR_PTR(error);
}
const struct inode_operations configfs_symlink_inode_operations = {
- .follow_link = configfs_follow_link,
+ .get_link = configfs_get_link,
.readlink = generic_readlink,
- .put_link = free_page_put_link,
.setattr = configfs_setattr,
};
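
For reference, the ->get_link() contract used in this conversion: return the target string (or an ERR_PTR), and schedule any cleanup through the delayed_call instead of a separate ->put_link(). A minimal sketch for an in-memory symlink, assuming the target string lives in inode->i_private:

	static const char *foo_get_link(struct dentry *dentry, struct inode *inode,
					struct delayed_call *done)
	{
		char *body;

		if (!dentry)			/* RCU walk: same bail-out as above */
			return ERR_PTR(-ECHILD);

		body = kstrdup(inode->i_private, GFP_KERNEL);
		if (!body)
			return ERR_PTR(-ENOMEM);

		set_delayed_call(done, kfree_link, body);	/* freed after the walk */
		return body;
	}
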
diff --git a/fs/coredump.c b/fs/coredump.c
index a8f75640ac86..b3c153ca435d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/timekeeping.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -232,9 +233,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* UNIX time of coredump */
case 't': {
- struct timeval tv;
- do_gettimeofday(&tv);
- err = cn_printf(cn, "%lu", tv.tv_sec);
+ time64_t time;
+
+ time = ktime_get_real_seconds();
+ err = cn_printf(cn, "%lld", time);
break;
}
/* hostname */
@@ -280,23 +282,24 @@ out:
return ispipe;
}
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
{
struct task_struct *t;
int nr = 0;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
- t = start;
- do {
+ for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
}
- } while_each_thread(start, t);
+ }
return nr;
}
@@ -311,10 +314,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- nr = zap_process(tsk, exit_code);
tsk->signal->group_exit_task = tsk;
- /* ignore all signals except SIGKILL, see prepare_signal() */
- tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -360,18 +361,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
continue;
if (g->flags & PF_KTHREAD)
continue;
- p = g;
- do {
- if (p->mm) {
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code);
- p->signal->flags = SIGNAL_GROUP_EXIT;
- unlock_task_sighand(p, &flags);
- }
- break;
+
+ for_each_thread(g, p) {
+ if (unlikely(!p->mm))
+ continue;
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code,
+ SIGNAL_GROUP_EXIT);
+ unlock_task_sighand(p, &flags);
}
- } while_each_thread(g, p);
+ break;
+ }
}
rcu_read_unlock();
done:
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 355c522f3585..b862bc219cd7 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
default:
diff --git a/fs/dax.c b/fs/dax.c
index a86d3cc2b389..7af879759064 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -28,49 +28,68 @@
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
-int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
+{
+ struct request_queue *q = bdev->bd_queue;
+ long rc = -EIO;
+
+ dax->addr = (void __pmem *) ERR_PTR(-EIO);
+ if (blk_queue_enter(q, true) != 0)
+ return rc;
+
+ rc = bdev_direct_access(bdev, dax);
+ if (rc < 0) {
+ dax->addr = (void __pmem *) ERR_PTR(rc);
+ blk_queue_exit(q);
+ return rc;
+ }
+ return rc;
+}
+
+static void dax_unmap_atomic(struct block_device *bdev,
+ const struct blk_dax_ctl *dax)
+{
+ if (IS_ERR(dax->addr))
+ return;
+ blk_queue_exit(bdev->bd_queue);
+}
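
The intended calling pattern for this pair, sketched as a hypothetical helper (dax_use_block and its arguments are illustrative; the struct fields come from the functions above):

	static int dax_use_block(struct block_device *bdev, sector_t sector,
				 size_t size)
	{
		struct blk_dax_ctl dax = {
			.sector	= sector,
			.size	= size,
		};
		long avail;

		avail = dax_map_atomic(bdev, &dax);
		if (avail < 0)
			return avail;	/* queue reference already dropped on failure */

		/* ... use dax.addr (kernel mapping) and dax.pfn while pinned ... */

		dax_unmap_atomic(bdev, &dax);	/* no-op if dax.addr is an ERR_PTR */
		return 0;
	}
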
+
+/*
+ * dax_clear_blocks() is called from within transaction context from XFS,
+ * and hence the stack from this point must follow GFP_NOFS
+ * semantics for all operations.
+ */
+int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- sector_t sector = block << (inode->i_blkbits - 9);
+ struct blk_dax_ctl dax = {
+ .sector = block << (inode->i_blkbits - 9),
+ .size = _size,
+ };
might_sleep();
do {
- void __pmem *addr;
- unsigned long pfn;
- long count;
+ long count, sz;
- count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+ count = dax_map_atomic(bdev, &dax);
if (count < 0)
return count;
- BUG_ON(size < count);
- while (count > 0) {
- unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
- if (pgsz > count)
- pgsz = count;
- clear_pmem(addr, pgsz);
- addr += pgsz;
- size -= pgsz;
- count -= pgsz;
- BUG_ON(pgsz & 511);
- sector += pgsz / 512;
- cond_resched();
- }
- } while (size);
+ sz = min_t(long, count, SZ_128K);
+ clear_pmem(dax.addr, sz);
+ dax.size -= sz;
+ dax.sector += sz / 512;
+ dax_unmap_atomic(bdev, &dax);
+ cond_resched();
+ } while (dax.size);
wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
-static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
- unsigned blkbits)
-{
- unsigned long pfn;
- sector_t sector = bh->b_blocknr << (blkbits - 9);
- return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
-}
-
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
loff_t pos, loff_t end)
@@ -100,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
return bh->b_state != 0;
}
+
+static sector_t to_sector(const struct buffer_head *bh,
+ const struct inode *inode)
+{
+ sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+ return sector;
+}
+
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t start, loff_t end, get_block_t get_block,
struct buffer_head *bh)
{
- ssize_t retval = 0;
- loff_t pos = start;
- loff_t max = start;
- loff_t bh_max = start;
- void __pmem *addr;
- bool hole = false;
- bool need_wmb = false;
-
- if (iov_iter_rw(iter) != WRITE)
+ loff_t pos = start, max = start, bh_max = start;
+ bool hole = false, need_wmb = false;
+ struct block_device *bdev = NULL;
+ int rw = iov_iter_rw(iter), rc;
+ long map_len = 0;
+ struct blk_dax_ctl dax = {
+ .addr = (void __pmem *) ERR_PTR(-EIO),
+ };
+
+ if (rw == READ)
end = min(end, i_size_read(inode));
while (pos < end) {
@@ -127,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
if (pos == bh_max) {
bh->b_size = PAGE_ALIGN(end - pos);
bh->b_state = 0;
- retval = get_block(inode, block, bh,
- iov_iter_rw(iter) == WRITE);
- if (retval)
+ rc = get_block(inode, block, bh, rw == WRITE);
+ if (rc)
break;
if (!buffer_size_valid(bh))
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
+ bdev = bh->b_bdev;
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -141,45 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size -= done;
}
- hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+ hole = rw == READ && !buffer_written(bh);
if (hole) {
- addr = NULL;
size = bh->b_size - first;
} else {
- retval = dax_get_addr(bh, &addr, blkbits);
- if (retval < 0)
+ dax_unmap_atomic(bdev, &dax);
+ dax.sector = to_sector(bh, inode);
+ dax.size = bh->b_size;
+ map_len = dax_map_atomic(bdev, &dax);
+ if (map_len < 0) {
+ rc = map_len;
break;
+ }
if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(addr, retval, first, pos,
- end);
+ dax_new_buf(dax.addr, map_len, first,
+ pos, end);
need_wmb = true;
}
- addr += first;
- size = retval - first;
+ dax.addr += first;
+ size = map_len - first;
}
max = min(pos + size, end);
}
if (iov_iter_rw(iter) == WRITE) {
- len = copy_from_iter_pmem(addr, max - pos, iter);
+ len = copy_from_iter_pmem(dax.addr, max - pos, iter);
need_wmb = true;
} else if (!hole)
- len = copy_to_iter((void __force *)addr, max - pos,
+ len = copy_to_iter((void __force *) dax.addr, max - pos,
iter);
else
len = iov_iter_zero(max - pos, iter);
- if (!len)
+ if (!len) {
+ rc = -EFAULT;
break;
+ }
pos += len;
- addr += len;
+ if (!IS_ERR(dax.addr))
+ dax.addr += len;
}
if (need_wmb)
wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
- return (pos == start) ? retval : pos - start;
+ return (pos == start) ? rc : pos - start;
}
/**
@@ -268,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
return VM_FAULT_LOCKED;
}
-static int copy_user_bh(struct page *to, struct buffer_head *bh,
- unsigned blkbits, unsigned long vaddr)
+static int copy_user_bh(struct page *to, struct inode *inode,
+ struct buffer_head *bh, unsigned long vaddr)
{
- void __pmem *vfrom;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
+ struct block_device *bdev = bh->b_bdev;
void *vto;
- if (dax_get_addr(bh, &vfrom, blkbits) < 0)
- return -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)vfrom, vaddr, to);
+ copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
kunmap_atomic(vto);
+ dax_unmap_atomic(bdev, &dax);
return 0;
}
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- void __pmem *addr;
- unsigned long pfn;
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev = bh->b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
pgoff_t size;
int error;
@@ -308,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
goto out;
}
- error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
- if (error < 0)
- goto out;
- if (error < PAGE_SIZE) {
- error = -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0) {
+ error = PTR_ERR(dax.addr);
goto out;
}
if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(addr, PAGE_SIZE);
+ clear_pmem(dax.addr, PAGE_SIZE);
wmb_pmem();
}
+ dax_unmap_atomic(bdev, &dax);
- error = vm_insert_mixed(vma, vaddr, pfn);
+ error = vm_insert_mixed(vma, vaddr, dax.pfn);
out:
i_mmap_unlock_read(mapping);
@@ -415,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
if (buffer_written(&bh))
- error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+ error = copy_user_bh(new_page, inode, &bh, vaddr);
else
clear_user_highpage(new_page, vaddr);
if (error)
@@ -516,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+static void __dax_dbg(struct buffer_head *bh, unsigned long address,
+ const char *reason, const char *fn)
+{
+ if (bh) {
+ char bname[BDEVNAME_SIZE];
+ bdevname(bh->b_bdev, bname);
+ pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
+ "length %zd fallback: %s\n", fn, current->comm,
+ address, bname, bh->b_state, (u64)bh->b_blocknr,
+ bh->b_size, reason);
+ } else {
+ pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
+ current->comm, address, reason);
+ }
+}
+
+#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
+
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, unsigned int flags, get_block_t get_block,
dax_iodone_t complete_unwritten)
@@ -527,37 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
unsigned blkbits = inode->i_blkbits;
unsigned long pmd_addr = address & PMD_MASK;
bool write = flags & FAULT_FLAG_WRITE;
- long length;
- void __pmem *kaddr;
+ struct block_device *bdev;
pgoff_t size, pgoff;
- sector_t block, sector;
- unsigned long pfn;
+ sector_t block;
int result = 0;
+ /* dax pmd mappings require pfn_t_devmap() */
+ if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
+ return VM_FAULT_FALLBACK;
+
/* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED))
+ if (write && !(vma->vm_flags & VM_SHARED)) {
+ split_huge_pmd(vma, pmd, address);
+ dax_pmd_dbg(NULL, address, "cow write");
return VM_FAULT_FALLBACK;
+ }
/* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start)
+ if (pmd_addr < vma->vm_start) {
+ dax_pmd_dbg(NULL, address, "vma start unaligned");
return VM_FAULT_FALLBACK;
- if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ }
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
+ dax_pmd_dbg(NULL, address, "vma end unaligned");
return VM_FAULT_FALLBACK;
+ }
pgoff = linear_page_index(vma, pmd_addr);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= size)
return VM_FAULT_SIGBUS;
/* If the PMD would cover blocks out of the file */
- if ((pgoff | PG_PMD_COLOUR) >= size)
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(NULL, address,
+ "offset + huge page size > file size");
return VM_FAULT_FALLBACK;
+ }
memset(&bh, 0, sizeof(bh));
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
bh.b_size = PMD_SIZE;
- length = get_block(inode, block, &bh, write);
- if (length)
+ if (get_block(inode, block, &bh, write) != 0)
return VM_FAULT_SIGBUS;
+ bdev = bh.b_bdev;
i_mmap_lock_read(mapping);
/*
@@ -565,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
* just fall back to PTEs. Calling get_block 512 times in a loop
* would be silly.
*/
- if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "allocated block too small");
goto fallback;
+ }
/*
* If we allocated new storage, make sure no process has any
@@ -589,50 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
result = VM_FAULT_SIGBUS;
goto out;
}
- if ((pgoff | PG_PMD_COLOUR) >= size)
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(&bh, address, "pgoff unaligned");
goto fallback;
+ }
if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
spinlock_t *ptl;
pmd_t entry;
struct page *zero_page = get_huge_zero_page();
- if (unlikely(!zero_page))
+ if (unlikely(!zero_page)) {
+ dax_pmd_dbg(&bh, address, "no zero page");
goto fallback;
+ }
ptl = pmd_lock(vma->vm_mm, pmd);
if (!pmd_none(*pmd)) {
spin_unlock(ptl);
+ dax_pmd_dbg(&bh, address, "pmd already present");
goto fallback;
}
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
+ __func__, current->comm, address,
+ (unsigned long long) to_sector(&bh, inode));
+
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
result = VM_FAULT_NOPAGE;
spin_unlock(ptl);
} else {
- sector = bh.b_blocknr << (blkbits - 9);
- length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
- bh.b_size);
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PMD_SIZE,
+ };
+ long length = dax_map_atomic(bdev, &dax);
+
if (length < 0) {
result = VM_FAULT_SIGBUS;
goto out;
}
- if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ if (length < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "dax-length too small");
+ dax_unmap_atomic(bdev, &dax);
+ goto fallback;
+ }
+ if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
+ dax_pmd_dbg(&bh, address, "pfn unaligned");
+ dax_unmap_atomic(bdev, &dax);
+ goto fallback;
+ }
+
+ if (!pfn_t_devmap(dax.pfn)) {
+ dax_unmap_atomic(bdev, &dax);
+ dax_pmd_dbg(&bh, address, "pfn not in memmap");
goto fallback;
+ }
if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- int i;
- for (i = 0; i < PTRS_PER_PMD; i++)
- clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ clear_pmem(dax.addr, PMD_SIZE);
wmb_pmem();
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
result |= VM_FAULT_MAJOR;
}
-
- result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ dax_unmap_atomic(bdev, &dax);
+
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: %lx sect: %llx\n",
+ __func__, current->comm, address,
+ pfn_t_to_pfn(dax.pfn),
+ (unsigned long long) dax.sector);
+ result |= vmf_insert_pfn_pmd(vma, address, pmd,
+ dax.pfn, write);
}
out:
@@ -734,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
if (err < 0)
return err;
if (buffer_written(&bh)) {
- void __pmem *addr;
- err = dax_get_addr(&bh, &addr, inode->i_blkbits);
- if (err < 0)
- return err;
- clear_pmem(addr + offset, length);
+ struct block_device *bdev = bh.b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PAGE_CACHE_SIZE,
+ };
+
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
}
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c33aeb0f68f..b4539e84e577 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
- struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT);
if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
@@ -1734,7 +1735,7 @@ static unsigned d_flags_for_inode(struct inode *inode)
}
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
- if (unlikely(inode->i_op->follow_link)) {
+ if (unlikely(inode->i_op->get_link)) {
add_flags = DCACHE_SYMLINK_TYPE;
goto type_determined;
}
@@ -3303,18 +3304,18 @@ out:
* @new_dentry: new dentry
* @old_dentry: old dentry
*
- * Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
- * Returns 0 otherwise.
+ * Returns true if new_dentry is a subdirectory of the parent (at any depth).
+ * Returns false otherwise.
* Caller must ensure that "new_dentry" is pinned before calling is_subdir()
*/
-int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
- int result;
+ bool result;
unsigned seq;
if (new_dentry == old_dentry)
- return 1;
+ return true;
do {
/* for restarting inner loop in case of seq retry */
@@ -3325,9 +3326,9 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
*/
rcu_read_lock();
if (d_ancestor(old_dentry, new_dentry))
- result = 1;
+ result = true;
else
- result = 0;
+ result = false;
rcu_read_unlock();
} while (read_seqretry(&rename_lock, seq));
@@ -3415,7 +3416,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6c55ade071c3..d2ba12e23ed9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -42,6 +42,22 @@ const struct file_operations debugfs_file_operations = {
.llseek = noop_llseek,
};
+static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_wo);
+
+ return debugfs_create_file(name, mode, parent, value, fops);
+}
+
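
The helper makes every numeric creator pick its fops variant from the mode bits, e.g. for the u8 flavor below (names and values illustrative):

	static u8 my_val;

	static void my_debugfs_init(struct dentry *dir)
	{
		debugfs_create_u8("rw_val", 0644, dir, &my_val);	/* full fops_u8 */
		debugfs_create_u8("ro_val", 0444, dir, &my_val);	/* fops_u8_ro   */
		debugfs_create_u8("wo_val", 0200, dir, &my_val);	/* fops_u8_wo   */
	}
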
static int debugfs_u8_set(void *data, u64 val)
{
*(u8 *)data = val;
@@ -83,14 +99,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ &fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -135,14 +145,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ &fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -187,14 +191,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ &fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -240,17 +238,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ &fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
+static int debugfs_ulong_set(void *data, u64 val)
+{
+ *(unsigned long *)data = val;
+ return 0;
+}
+
+static int debugfs_ulong_get(void *data, u64 *val)
+{
+ *val = *(unsigned long *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+
+/**
+ * debugfs_create_ulong - create a debugfs file that is used to read and write
+ * an unsigned long value.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and
+ *	write to.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. Depending on which permission
+ * bits are set in @mode, the file can be read from and/or written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned. It is not wise to check for this value; rather, check for
+ * %NULL or !%NULL instead, so as to eliminate the need for #ifdef in the
+ * calling code.
+ */
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+ struct dentry *parent, unsigned long *value)
+{
+ return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
+ &fops_ulong_ro, &fops_ulong_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_ulong);
+
DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
@@ -264,6 +304,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -286,14 +328,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ &fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -310,14 +346,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ &fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -334,14 +364,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ &fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -358,7 +382,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_x64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ &fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -375,6 +400,8 @@ static int debugfs_size_t_get(void *data, u64 *val)
}
DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
"%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -389,7 +416,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -422,16 +450,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
@@ -439,7 +459,7 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[3];
- u32 *val = file->private_data;
+ bool *val = file->private_data;
if (*val)
buf[0] = 'Y';
@@ -457,7 +477,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
char buf[32];
size_t buf_size;
bool bv;
- u32 *val = file->private_data;
+ bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
@@ -478,6 +498,18 @@ static const struct file_operations fops_bool = {
.llseek = default_llseek,
};
+static const struct file_operations fops_bool_ro = {
+ .read = debugfs_read_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_wo = {
+ .write = debugfs_write_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
/**
* debugfs_create_bool - create a debugfs file that is used to read and write a boolean value
* @name: a pointer to a string containing the name of the file to create.
@@ -503,9 +535,10 @@ static const struct file_operations fops_bool = {
* code.
*/
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+ struct dentry *parent, bool *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_bool);
+ return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ &fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
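
Note the type change this imposes on callers: the backing variable must now be a bool rather than a u32. An updated caller would look like this (names illustrative):

	static bool my_flag;			/* previously had to be a u32 */

	static void my_flag_debugfs_init(struct dentry *parent)
	{
		debugfs_create_bool("my_flag", 0644, parent, &my_flag);
	}
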
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c711be8d6a3c..b7fcc0de0b2f 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -271,8 +271,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
+
+ if (IS_ERR(dentry)) {
mutex_unlock(&d_inode(parent)->i_mutex);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ }
+
return dentry;
}
@@ -533,7 +537,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
/**
* debugfs_remove - removes a file or directory from the debugfs filesystem
* @dentry: a pointer to a the dentry of the file or directory to be
- * removed.
+ * removed. If this parameter is NULL or an error value, nothing
+ * will be done.
*
* This function removes a file or directory in debugfs that was previously
* created with a call to another debugfs function (like
@@ -565,7 +570,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
/**
* debugfs_remove_recursive - recursively removes a directory
- * @dentry: a pointer to a the dentry of the directory to be removed.
+ * @dentry: a pointer to a the dentry of the directory to be removed. If this
+ * parameter is NULL or an error value, nothing will be done.
*
* This function recursively removes a directory tree in debugfs that
* was previously created with a call to another debugfs function
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 11256291642e..602e8441bc0f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -109,6 +109,8 @@ struct dio_submit {
struct dio {
int flags; /* doesn't change */
int rw;
+ blk_qc_t bio_cookie;
+ struct block_device *bio_bdev;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
dio_iodone_t *end_io; /* IO completion function */
@@ -120,6 +122,7 @@ struct dio {
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
bool defer_completion; /* defer AIO completion to workqueue? */
+ bool should_dirty; /* if pages should be dirtied */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -360,7 +363,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
/*
* bio_alloc() is guaranteed to return a bio when called with
- * __GFP_WAIT and we request a valid number of vectors.
+ * __GFP_RECLAIM and we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
@@ -393,14 +396,17 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->rw == READ)
+ if (dio->is_async && dio->rw == READ && dio->should_dirty)
bio_set_pages_dirty(bio);
- if (sdio->submit_io)
+ dio->bio_bdev = bio->bi_bdev;
+
+ if (sdio->submit_io) {
sdio->submit_io(dio->rw, bio, dio->inode,
sdio->logical_offset_in_bio);
- else
- submit_bio(dio->rw, bio);
+ dio->bio_cookie = BLK_QC_T_NONE;
+ } else
+ dio->bio_cookie = submit_bio(dio->rw, bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -439,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- io_schedule();
+ if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -464,14 +471,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (bio->bi_error)
dio->io_error = -EIO;
- if (dio->is_async && dio->rw == READ) {
+ if (dio->is_async && dio->rw == READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
err = bio->bi_error;
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- if (dio->rw == READ && !PageCompound(page))
+ if (dio->rw == READ && !PageCompound(page) &&
+ dio->should_dirty)
set_page_dirty_lock(page);
page_cache_release(page);
}
@@ -1161,6 +1169,16 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
}
}
+ /* Once we sampled i_size check for reads beyond EOF */
+ dio->i_size = i_size_read(inode);
+ if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
+ if (dio->flags & DIO_LOCKING)
+ mutex_unlock(&inode->i_mutex);
+ kmem_cache_free(dio_cache, dio);
+ retval = 0;
+ goto out;
+ }
+
/*
* For file extending writes updating i_size before data writeouts
* complete can expose uninitialized blocks in dumb filesystems.
@@ -1214,11 +1232,11 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
sdio.next_block_for_io = -1;
dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
+ dio->should_dirty = (iter->type == ITER_IOVEC);
sdio.iter = iter;
sdio.final_block_in_request =
(offset + iov_iter_count(iter)) >> blkbits;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index d521bddf876d..8e294fbbac39 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -61,35 +61,8 @@ static struct config_item *make_node(struct config_group *, const char *);
static void drop_node(struct config_group *, struct config_item *);
static void release_node(struct config_item *);
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_cluster(struct config_item *i,
- struct configfs_attribute *a,
- const char *buf, size_t len);
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len);
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len);
-
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf);
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
- size_t len);
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
- size_t len);
+static struct configfs_attribute *comm_attrs[];
+static struct configfs_attribute *node_attrs[];
struct dlm_cluster {
struct config_group group;
@@ -108,6 +81,12 @@ struct dlm_cluster {
char cl_cluster_name[DLM_LOCKSPACE_LEN];
};
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
+{
+ return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+ NULL;
+}
+
enum {
CLUSTER_ATTR_TCP_PORT = 0,
CLUSTER_ATTR_BUFFER_SIZE,
@@ -124,33 +103,24 @@ enum {
CLUSTER_ATTR_CLUSTER_NAME,
};
-struct cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_cluster *, char *);
- ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
-};
-
-static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf)
{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
return sprintf(buf, "%s\n", cl->cl_cluster_name);
}
-static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+static ssize_t cluster_cluster_name_store(struct config_item *item,
const char *buf, size_t len)
{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
+
strlcpy(dlm_config.ci_cluster_name, buf,
sizeof(dlm_config.ci_cluster_name));
strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
-static struct cluster_attribute cluster_attr_cluster_name = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "cluster_name",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = cluster_cluster_name_read,
- .store = cluster_cluster_name_write,
-};
+CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
int *info_field, int check_zero,
@@ -175,17 +145,19 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
}
#define CLUSTER_ATTR(name, check_zero) \
-static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
+static ssize_t cluster_##name##_store(struct config_item *item, \
+ const char *buf, size_t len) \
{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
check_zero, buf, len); \
} \
-static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \
+static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
} \
-static struct cluster_attribute cluster_attr_##name = \
-__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+CONFIGFS_ATTR(cluster_, name);
CLUSTER_ATTR(tcp_port, 1);
CLUSTER_ATTR(buffer_size, 1);
@@ -201,19 +173,19 @@ CLUSTER_ATTR(new_rsb_count, 0);
CLUSTER_ATTR(recover_callbacks, 0);
static struct configfs_attribute *cluster_attrs[] = {
- [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
- [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
- [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
- [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
- [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
- [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
- [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
- [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
- [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
- [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
- [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
- [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
- [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
+ [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
+ [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size,
+ [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size,
+ [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer,
+ [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
+ [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
+ [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+ [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
+ [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
+ [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+ [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
+ [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
+ [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
NULL,
};
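
The conversion pattern used throughout this file: CONFIGFS_ATTR(_pfx, _name) wires _pfx##_name##_show and _pfx##_name##_store into a configfs_attribute named _pfx##attr_##_name, which is what the arrays above and below reference. Schematically, for a hypothetical foo/bar pair:

	struct foo {				/* hypothetical config_item wrapper */
		struct config_item	item;
		int			bar;
	};

	static inline struct foo *to_foo(struct config_item *item)
	{
		return container_of(item, struct foo, item);
	}

	static ssize_t foo_bar_show(struct config_item *item, char *buf)
	{
		return sprintf(buf, "%d\n", to_foo(item)->bar);
	}

	static ssize_t foo_bar_store(struct config_item *item, const char *buf,
				     size_t len)
	{
		int rc = kstrtoint(buf, 0, &to_foo(item)->bar);

		return rc ? rc : len;
	}
	CONFIGFS_ATTR(foo_, bar);		/* emits foo_attr_bar */
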
@@ -224,83 +196,11 @@ enum {
COMM_ATTR_ADDR_LIST,
};
-struct comm_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_comm *, char *);
- ssize_t (*store)(struct dlm_comm *, const char *, size_t);
-};
-
-static struct comm_attribute comm_attr_nodeid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "nodeid",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = comm_nodeid_read,
- .store = comm_nodeid_write,
-};
-
-static struct comm_attribute comm_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = comm_local_read,
- .store = comm_local_write,
-};
-
-static struct comm_attribute comm_attr_addr = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "addr",
- .ca_mode = S_IWUSR },
- .store = comm_addr_write,
-};
-
-static struct comm_attribute comm_attr_addr_list = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "addr_list",
- .ca_mode = S_IRUGO },
- .show = comm_addr_list_read,
-};
-
-static struct configfs_attribute *comm_attrs[] = {
- [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
- [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
- [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
- [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr,
- NULL,
-};
-
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
};
-struct node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_node *, char *);
- ssize_t (*store)(struct dlm_node *, const char *, size_t);
-};
-
-static struct node_attribute node_attr_nodeid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "nodeid",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = node_nodeid_read,
- .store = node_nodeid_write,
-};
-
-static struct node_attribute node_attr_weight = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "weight",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = node_weight_read,
- .store = node_weight_write,
-};
-
-static struct configfs_attribute *node_attrs[] = {
- [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
- [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
- NULL,
-};
-
struct dlm_clusters {
struct configfs_subsystem subsys;
};
@@ -349,8 +249,6 @@ static struct configfs_group_operations clusters_ops = {
static struct configfs_item_operations cluster_ops = {
.release = release_cluster,
- .show_attribute = show_cluster,
- .store_attribute = store_cluster,
};
static struct configfs_group_operations spaces_ops = {
@@ -369,8 +267,6 @@ static struct configfs_group_operations comms_ops = {
static struct configfs_item_operations comm_ops = {
.release = release_comm,
- .show_attribute = show_comm,
- .store_attribute = store_comm,
};
static struct configfs_group_operations nodes_ops = {
@@ -380,8 +276,6 @@ static struct configfs_group_operations nodes_ops = {
static struct configfs_item_operations node_ops = {
.release = release_node,
- .show_attribute = show_node,
- .store_attribute = store_node,
};
static struct config_item_type clusters_type = {
@@ -427,12 +321,6 @@ static struct config_item_type node_type = {
.ct_owner = THIS_MODULE,
};
-static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
-{
- return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
- NULL;
-}
-
static struct dlm_space *config_item_to_space(struct config_item *i)
{
return i ? container_of(to_config_group(i), struct dlm_space, group) :
@@ -687,66 +575,30 @@ void dlm_config_exit(void)
* Functions for user space to read/write attributes
*/
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_cluster *cl = config_item_to_cluster(i);
- struct cluster_attribute *cla =
- container_of(a, struct cluster_attribute, attr);
- return cla->show ? cla->show(cl, buf) : 0;
-}
-
-static ssize_t store_cluster(struct config_item *i,
- struct configfs_attribute *a,
- const char *buf, size_t len)
+static ssize_t comm_nodeid_show(struct config_item *item, char *buf)
{
- struct dlm_cluster *cl = config_item_to_cluster(i);
- struct cluster_attribute *cla =
- container_of(a, struct cluster_attribute, attr);
- return cla->store ? cla->store(cl, buf, len) : -EINVAL;
-}
-
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_comm *cm = config_item_to_comm(i);
- struct comm_attribute *cma =
- container_of(a, struct comm_attribute, attr);
- return cma->show ? cma->show(cm, buf) : 0;
-}
-
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_comm *cm = config_item_to_comm(i);
- struct comm_attribute *cma =
- container_of(a, struct comm_attribute, attr);
- return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+ return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid);
}
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
-{
- return sprintf(buf, "%d\n", cm->nodeid);
-}
-
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &cm->nodeid);
+ int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid);
if (rc)
return rc;
return len;
}
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_local_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", cm->local);
+ return sprintf(buf, "%d\n", config_item_to_comm(item)->local);
}
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_local_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
int rc = kstrtoint(buf, 0, &cm->local);
if (rc)
@@ -756,8 +608,10 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
return len;
}
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_store(struct config_item *item, const char *buf,
+ size_t len)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
struct sockaddr_storage *addr;
int rv;
@@ -783,8 +637,9 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
return len;
}
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_addr_list_show(struct config_item *item, char *buf)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
ssize_t s;
ssize_t allowance;
int i;
@@ -827,32 +682,28 @@ static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
return 4096 - allowance;
}
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_node *nd = config_item_to_node(i);
- struct node_attribute *nda =
- container_of(a, struct node_attribute, attr);
- return nda->show ? nda->show(nd, buf) : 0;
-}
+CONFIGFS_ATTR(comm_, nodeid);
+CONFIGFS_ATTR(comm_, local);
+CONFIGFS_ATTR_WO(comm_, addr);
+CONFIGFS_ATTR_RO(comm_, addr_list);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_node *nd = config_item_to_node(i);
- struct node_attribute *nda =
- container_of(a, struct node_attribute, attr);
- return nda->store ? nda->store(nd, buf, len) : -EINVAL;
-}
+static struct configfs_attribute *comm_attrs[] = {
+ [COMM_ATTR_NODEID] = &comm_attr_nodeid,
+ [COMM_ATTR_LOCAL] = &comm_attr_local,
+ [COMM_ATTR_ADDR] = &comm_attr_addr,
+ [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list,
+ NULL,
+};
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
+static ssize_t node_nodeid_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", nd->nodeid);
+ return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid);
}
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_node *nd = config_item_to_node(item);
uint32_t seq = 0;
int rc = kstrtoint(buf, 0, &nd->nodeid);
@@ -863,21 +714,30 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
return len;
}
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
+static ssize_t node_weight_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", nd->weight);
+ return sprintf(buf, "%d\n", config_item_to_node(item)->weight);
}
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_weight_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &nd->weight);
+ int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight);
if (rc)
return rc;
return len;
}
+CONFIGFS_ATTR(node_, nodeid);
+CONFIGFS_ATTR(node_, weight);
+
+static struct configfs_attribute *node_attrs[] = {
+ [NODE_ATTR_NODEID] = &node_attr_nodeid,
+ [NODE_ATTR_WEIGHT] = &node_attr_weight,
+ NULL,
+};
+
/*
* Functions for the dlm to get the info that's been configured
*/
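For reference: each CONFIGFS_ATTR(comm_, nodeid) above generates the struct configfs_attribute object comm_attr_nodeid named in comm_attrs[] and wires it to comm_nodeid_show()/comm_nodeid_store(). A minimal sketch of the expansion, assuming the macro definition this series adds to include/linux/configfs.h:

        /* Approximate expansion of CONFIGFS_ATTR(comm_, nodeid); the
         * authoritative definition lives in include/linux/configfs.h. */
        static struct configfs_attribute comm_attr_nodeid = {
                .ca_owner = THIS_MODULE,
                .ca_name  = "nodeid",
                .ca_mode  = S_IRUGO | S_IWUSR,
                .show     = comm_nodeid_show,
                .store    = comm_nodeid_store,
        };

CONFIGFS_ATTR_RO() omits .store and CONFIGFS_ATTR_WO() omits .show, matching the read-only addr_list and write-only addr attributes above.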
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 87e9d796cf7d..3a37bd3f9637 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -421,7 +421,7 @@ static void lowcomms_write_space(struct sock *sk)
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
con->sock->sk->sk_write_pending--;
- clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+ clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
}
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
@@ -1448,7 +1448,7 @@ static void send_to_sock(struct connection *con)
msg_flags);
if (ret == -EAGAIN || ret == 0) {
if (ret == -EAGAIN &&
- test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+ test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
!test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
/* Notify TCP that we're limited by the
* application window size.
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 5532f097f6da..d401425f602a 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -145,7 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
if (xop->callback == NULL) {
- rv = wait_event_killable(recv_wq, (op->done != 0));
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
log_debug(ls, "dlm_posix_lock: wait killed %llx",
(unsigned long long)number);
@@ -172,7 +172,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = op->info.rv;
if (!rv) {
- if (posix_lock_file_wait(file, fl) < 0)
+ if (locks_lock_file_wait(file, fl) < 0)
log_error(ls, "dlm_posix_lock: vfs lock error %llx",
(unsigned long long)number);
}
@@ -262,7 +262,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
/* cause the vfs unlock to return ENOENT if lock is not found */
fl->fl_flags |= FL_EXISTS;
- rv = posix_lock_file_wait(file, fl);
+ rv = locks_lock_file_wait(file, fl);
if (rv == -ENOENT) {
rv = 0;
goto out_free;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 173b3873a4f4..1925d6d222b8 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -515,14 +515,9 @@ static ssize_t device_write(struct file *file, const char __user *buf,
if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
return -EINVAL;
- kbuf = kzalloc(count + 1, GFP_NOFS);
- if (!kbuf)
- return -ENOMEM;
-
- if (copy_from_user(kbuf, buf, count)) {
- error = -EFAULT;
- goto out_free;
- }
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
if (check_version(kbuf)) {
error = -EBADE;
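memdup_user_nul() replaces the open-coded kzalloc()/copy_from_user() pair and returns either a NUL-terminated kernel copy of the user buffer or an ERR_PTR()-encoded errno; it never returns NULL, so the result must be tested with IS_ERR() (note the polarity: a negated test would bail out on every successful copy). A sketch of the idiom, with example_write() as a hypothetical caller:

        static ssize_t example_write(const char __user *ubuf, size_t count)
        {
                char *kbuf = memdup_user_nul(ubuf, count);

                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);   /* -EFAULT or -ENOMEM */
                /* ... parse kbuf as a NUL-terminated string ... */
                kfree(kbuf);
                return count;
        }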
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5ba029e627cc..7b39260c7bba 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -86,7 +86,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key)
{
if (key->type == &key_type_encrypted)
return (struct ecryptfs_auth_tok *)
- (&((struct encrypted_key_payload *)key->payload.data)->payload_data);
+ (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data);
else
return NULL;
}
@@ -117,8 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key)
auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
if (!auth_tok)
- return (struct ecryptfs_auth_tok *)
- (((struct user_key_payload *)key->payload.data)->data);
+ return (struct ecryptfs_auth_tok *)user_key_payload(key)->data;
else
return auth_tok;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3c4db1172d22..040aa879d634 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -270,7 +270,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
mode);
- if (unlikely(IS_ERR(ecryptfs_inode))) {
+ if (IS_ERR(ecryptfs_inode)) {
ecryptfs_printk(KERN_WARNING, "Failed to create file in"
"lower filesystem\n");
rc = PTR_ERR(ecryptfs_inode);
@@ -282,9 +282,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
if (rc) {
ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
ecryptfs_inode);
- make_bad_inode(ecryptfs_inode);
- unlock_new_inode(ecryptfs_inode);
- iput(ecryptfs_inode);
+ iget_failed(ecryptfs_inode);
goto out;
}
unlock_new_inode(ecryptfs_inode);
@@ -674,16 +672,24 @@ out:
return rc ? ERR_PTR(rc) : buf;
}
-static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *ecryptfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
size_t len;
- char *buf = ecryptfs_readlink_lower(dentry, &len);
+ char *buf;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ buf = ecryptfs_readlink_lower(dentry, &len);
if (IS_ERR(buf))
return buf;
fsstack_copy_attr_atime(d_inode(dentry),
d_inode(ecryptfs_dentry_to_lower(dentry)));
buf[len] = '\0';
- return *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
+ return buf;
}
/**
@@ -1095,8 +1101,7 @@ out:
const struct inode_operations ecryptfs_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = ecryptfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr_link,
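The old .follow_link/.put_link pair collapses into a single ->get_link() that can be called in RCU-walk mode (dentry == NULL) and registers its cleanup through a delayed_call instead of a separate put_link method. A minimal sketch of the calling convention, assuming the VFS helpers named in the hunk above (example_get_link() is hypothetical):

        static const char *example_get_link(struct dentry *dentry,
                                            struct inode *inode,
                                            struct delayed_call *done)
        {
                char *buf;

                if (!dentry)                    /* RCU walk: no dentry yet */
                        return ERR_PTR(-ECHILD);        /* retry in ref-walk */

                buf = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!buf)
                        return ERR_PTR(-ENOMEM);
                /* ... fill buf with the NUL-terminated target path ... */
                set_delayed_call(done, kfree_link, buf);        /* freed later */
                return buf;
        }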
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f4d0474bee9..e25b6b06bacf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -663,6 +663,7 @@ static struct ecryptfs_cache_info {
struct kmem_cache **cache;
const char *name;
size_t size;
+ unsigned long flags;
void (*ctor)(void *obj);
} ecryptfs_cache_infos[] = {
{
@@ -684,6 +685,7 @@ static struct ecryptfs_cache_info {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
+ .flags = SLAB_ACCOUNT,
.ctor = inode_info_init_once,
},
{
@@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void)
struct ecryptfs_cache_info *info;
info = &ecryptfs_cache_infos[i];
- *(info->cache) = kmem_cache_create(info->name, info->size,
- 0, SLAB_HWCACHE_ALIGN, info->ctor);
+ *(info->cache) = kmem_cache_create(info->name, info->size, 0,
+ SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
if (!*(info->cache)) {
ecryptfs_free_kmem_caches();
ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 079d20306ee1..cdf0872382af 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &efs_symlink_aops;
break;
case S_IFCHR:
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a30f7da..cb68dac4f9d3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
- sizeof(struct efs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct efs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 75117d0dac2b..4870cc82deb0 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -13,7 +13,7 @@
static int efs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct buffer_head * bh;
struct inode * inode = page->mapping->host;
efs_block_t size = inode->i_size;
@@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
}
link[size] = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
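The kmap()/kunmap() calls become unnecessary once the symlink inode is flagged with inode_nohighmem() in efs_iget() above: assuming inode_nohighmem() clamps the mapping's allocation mask so its pages never come from highmem, page_address() is always valid on them. A sketch of the invariant:

        /* With inode_nohighmem(inode) set, symlink page-cache pages are
         * lowmem and permanently mapped, so no kmap() is required: */
        struct page *page = read_mapping_page(inode->i_mapping, 0, NULL);

        if (!IS_ERR(page)) {
                char *link = page_address(page);        /* direct access */
                /* ... use link ... */
                put_page(page);
        }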
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8d0c0df01854..ed70cf9fdc7b 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -45,10 +45,10 @@ struct eventfd_ctx {
*
* This function is supposed to be called by the kernel in paths that do not
* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returining a POLLERR
+ * value, and we signal this as an overflow condition by returning a POLLERR
* to poll(2).
*
- * Returns the amount by which the counter was incrememnted. This will be less
+ * Returns the amount by which the counter was incremented. This will be less
* than @n if the counter has overflowed.
*/
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
diff --git a/fs/exec.c b/fs/exec.c
index b06623a9347f..828ec5f07de0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -119,7 +119,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
- .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+ .acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
@@ -763,7 +763,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
int err;
struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
- .acc_mode = MAY_EXEC | MAY_OPEN,
+ .acc_mode = MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 73c64daa0f55..9eaf595aeaf8 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
@@ -1227,6 +1224,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
inode->i_link = (char *)oi->i_data;
} else {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
}
} else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 09a6bb1ad63c..c20d77df2679 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -80,9 +80,6 @@ static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
inode = exofs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -114,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
if (l > sizeof(oi->i_data)) {
/* slow symlink */
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
memset(oi->i_data, 0, sizeof(oi->i_data));
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..6658a50530a0 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
{
exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- exofs_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, exofs_init_once);
if (exofs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8d15febd0aa3..4c69c94cafd8 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -684,6 +684,9 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
+#ifdef CONFIG_FS_DAX
+ struct rw_semaphore dax_sem;
+#endif
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -699,6 +702,14 @@ struct ext2_inode_info {
#endif
};
+#ifdef CONFIG_FS_DAX
+#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
+#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
+#else
+#define dax_sem_down_write(ext2_inode)
+#define dax_sem_up_write(ext2_inode)
+#endif
+
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1982c3f11aec..11a42c5a09ae 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -27,27 +27,103 @@
#include "acl.h"
#ifdef CONFIG_FS_DAX
+/*
+ * The lock ordering for ext2 DAX fault paths is:
+ *
+ * mmap_sem (MM)
+ *   sb_start_pagefault (vfs, freeze)
+ *     ext2_inode_info->dax_sem
+ *       address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
+ *         ext2_inode_info->truncate_mutex
+ *
+ * The default page_lock and i_size verification done by non-DAX fault paths
+ * is sufficient because ext2 doesn't support hole punching.
+ */
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
- return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ /* check that the faulting page hasn't raced with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
.pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
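Taken together with the inode.c hunks below, the discipline is: fault paths take dax_sem shared (many faults may proceed in parallel), truncate takes it exclusive so no fault can observe blocks being freed. A condensed sketch of the two sides (newsize stands in for the truncate target):

        /* fault side: nested inside freeze protection, shared */
        sb_start_pagefault(inode->i_sb);
        down_read(&EXT2_I(inode)->dax_sem);
        /* ... __dax_fault() / __dax_pmd_fault() ... */
        up_read(&EXT2_I(inode)->dax_sem);
        sb_end_pagefault(inode->i_sb);

        /* truncate side: exclusive; dax_sem_down_write() compiles away
         * when CONFIG_FS_DAX is not set */
        dax_sem_down_write(EXT2_I(inode));
        truncate_setsize(inode, newsize);
        __ext2_truncate_blocks(inode, newsize);
        dax_sem_up_write(EXT2_I(inode));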
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c60a248c640c..338eefda70c6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1085,6 +1085,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
+/* dax_sem must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1100,6 +1101,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
blocksize = inode->i_sb->s_blocksize;
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
+#ifdef CONFIG_FS_DAX
+ WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+#endif
+
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0)
return;
@@ -1185,7 +1190,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
return;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
+
+ dax_sem_down_write(EXT2_I(inode));
__ext2_truncate_blocks(inode, offset);
+ dax_sem_up_write(EXT2_I(inode));
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1213,8 +1221,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
+ dax_sem_down_write(EXT2_I(inode));
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
+ dax_sem_up_write(EXT2_I(inode));
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
@@ -1410,6 +1420,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index b4841e3066a5..7a2be8f7f3c3 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -143,9 +143,6 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
struct inode * inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = dquot_initialize(dir);
if (err)
return err;
@@ -186,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sizeof (EXT2_I(inode)->i_data)) {
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19cf9ef6..2a188413a2b0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,6 +192,9 @@ static void init_once(void *foo)
init_rwsem(&ei->xattr_sem);
#endif
mutex_init(&ei->truncate_mutex);
+#ifdef CONFIG_FS_DAX
+ init_rwsem(&ei->dax_sem);
+#endif
inode_init_once(&ei->vfs_inode);
}
@@ -200,7 +203,7 @@ static int __init init_inodecache(void)
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
@@ -566,6 +569,8 @@ static int parse_options(char *options, struct super_block *sb)
/* Fall through */
case Opt_dax:
#ifdef CONFIG_FS_DAX
+ ext2_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
set_opt(sbi->s_mount_opt, DAX);
#else
ext2_msg(sb, KERN_INFO, "dax option not supported");
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index ae17179f3810..3495d8ae4b33 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -22,8 +22,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
@@ -35,7 +34,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0b6bfd3a398b..f57a7aba32eb 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -77,10 +77,8 @@
printk("\n"); \
} while (0)
# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
+ printk(KERN_DEBUG "block %pg:%lu: ", \
+ bh->b_bdev, (unsigned long) bh->b_blocknr); \
printk(f); \
printk("\n"); \
} while (0)
@@ -292,17 +290,21 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
const struct xattr_handler *handler =
ext2_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest) {
error = -ERANGE;
goto cleanup;
}
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
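With this rework, ->list() shrinks to a per-dentry visibility predicate and the listxattr loop assembles "<prefix><name>\0" itself. A hypothetical helper mirroring the emission step in the hunk above:

        /* Emit one attribute name into the listxattr buffer and return
         * the number of bytes written (prefix + name + trailing NUL). */
        static size_t emit_xattr_name(char *buffer,
                                      const struct xattr_handler *handler,
                                      const char *name, size_t name_len)
        {
                const char *prefix = handler->prefix ?: handler->name;
                size_t prefix_len = strlen(prefix);

                memcpy(buffer, prefix, prefix_len);
                memcpy(buffer + prefix_len, name, name_len);
                buffer[prefix_len + name_len] = '\0';
                return prefix_len + name_len + 1;
        }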
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 702fc6840246..ba97f243b050 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -7,37 +7,20 @@
#include <linux/security.h>
#include "xattr.h"
-static size_t
-ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
-ext2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
buffer, size);
}
static int
-ext2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
@@ -68,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir,
const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext2_xattr_security_list,
.get = ext2_xattr_security_get,
.set = ext2_xattr_security_set,
};
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 42b6e9874bcc..2c94d1930626 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -8,40 +8,26 @@
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext2_xattr_trusted_list(struct dentry *dentry)
{
- const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
-ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
buffer, size);
}
static int
-ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index ecdc4605192c..72a2a96d677f 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -10,30 +10,17 @@
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext2_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
-ext2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
@@ -41,11 +28,10 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 75285ea9aa05..f52cf54f0cbc 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o readpage.o
+ xattr_trusted.o inline.o readpage.o sysfs.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cd6ea29be645..ec0668a60678 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,6 +191,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -203,7 +204,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return -EIO;
+ return -EFSBADCRC;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -213,7 +214,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
start = ext4_group_first_block_no(sb, block_group);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
flex_bg = 1;
/* Set bits for block and inode bitmaps, and inode table */
@@ -322,7 +323,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (ext4_has_feature_flex_bg(sb)) {
/* with FLEX_BG, the inode/block bitmaps and itable
* blocks may not be in the group at all
* so the bitmap validation will be skipped for those groups
@@ -360,42 +361,45 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
return 0;
}
-static void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh)
+static int ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- return;
+ if (buffer_verified(bh))
+ return 0;
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
- blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
- if (unlikely(blk != 0)) {
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+ desc, bh))) {
ext4_unlock_group(sb, block_group);
- ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
- block_group, blk);
+ ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EFSBADCRC;
}
- if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ if (unlikely(blk != 0)) {
ext4_unlock_group(sb, block_group);
- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+ block_group, blk);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EFSCORRUPTED;
}
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
+ return 0;
}
/**
@@ -414,17 +418,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
+ int err;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot get buffer for block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
if (bitmap_uptodate(bh))
@@ -437,7 +442,6 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- int err;
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
@@ -445,7 +449,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
if (err)
- ext4_error(sb, "Checksum bad for grp %u", block_group);
+ goto out;
goto verify;
}
ext4_unlock_group(sb, block_group);
@@ -468,11 +472,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
- ext4_validate_block_bitmap(sb, desc, block_group, bh);
- if (buffer_verified(bh))
- return bh;
+ err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ if (err)
+ goto out;
+ return bh;
+out:
put_bh(bh);
- return NULL;
+ return ERR_PTR(err);
}
/* Returns 0 on success, 1 on error */
@@ -485,32 +491,32 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return 1;
+ return -EFSCORRUPTED;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
- return 1;
+ return -EIO;
}
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
- ext4_validate_block_bitmap(sb, desc, block_group, bh);
- /* ...but check for error just in case errors=continue. */
- return !buffer_verified(bh);
+ return ext4_validate_block_bitmap(sb, desc, block_group, bh);
}
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct buffer_head *bh;
+ int err;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
- if (!bh)
- return NULL;
- if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+ if (IS_ERR(bh))
+ return bh;
+ err = ext4_wait_block_bitmap(sb, block_group, bh);
+ if (err) {
put_bh(bh);
- return NULL;
+ return ERR_PTR(err);
}
return bh;
}
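Moving ext4_read_block_bitmap_nowait() from NULL returns to the ERR_PTR() convention lets callers propagate the specific errno (-ENOMEM, -EIO, -EFSCORRUPTED, -EFSBADCRC) instead of guessing. The caller pattern, sketched with a hypothetical example_use_bitmap():

        static int example_use_bitmap(struct super_block *sb, ext4_group_t group)
        {
                struct buffer_head *bh = ext4_read_block_bitmap(sb, group);

                if (IS_ERR(bh))
                        return PTR_ERR(bh);     /* was: if (!bh) return -EIO; */
                /* ... scan bh->b_data ... */
                brelse(bh);
                return 0;
        }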
@@ -681,8 +687,10 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
- if (bitmap_bh == NULL)
+ if (IS_ERR(bitmap_bh)) {
+ bitmap_bh = NULL;
continue;
+ }
x = ext4_count_free(bitmap_bh->b_data,
EXT4_CLUSTERS_PER_GROUP(sb) / 8);
@@ -740,14 +748,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
if (group == 0)
return 1;
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (ext4_has_feature_sparse_super2(sb)) {
if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
group == le32_to_cpu(es->s_backup_bgs[1]))
return 1;
return 0;
}
- if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ if ((group <= 1) || !ext4_has_feature_sparse_super(sb))
return 1;
if (!(group & 1))
return 0;
@@ -776,7 +783,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
if (!ext4_bg_has_super(sb, group))
return 0;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ if (ext4_has_feature_meta_bg(sb))
return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
else
return EXT4_SB(sb)->s_gdb_count;
@@ -797,8 +804,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
- metagroup < first_meta_bg)
+ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg)
return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
@@ -818,7 +824,7 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
/* Check for superblock and gdt backups in this group */
num = ext4_bg_has_super(sb, block_group);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ if (!ext4_has_feature_meta_bg(sb) ||
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3522340c7a99..02ddec6d8a7d 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -234,7 +234,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
es->s_last_error_block = cpu_to_le64(blk);
ext4_error_inode(inode, function, line, blk,
"invalid block");
- return -EIO;
+ return -EFSCORRUPTED;
}
}
return 0;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 45731558138c..1a0835073663 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -253,8 +253,7 @@ typedef enum {
EXT4_ENCRYPT,
} ext4_direction_t;
-static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
- struct inode *inode,
+static int ext4_page_crypto(struct inode *inode,
ext4_direction_t rw,
pgoff_t index,
struct page *src_page,
@@ -296,7 +295,6 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
else
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -353,7 +351,7 @@ struct page *ext4_encrypt(struct inode *inode,
if (IS_ERR(ciphertext_page))
goto errout;
ctx->w.control_page = plaintext_page;
- err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
plaintext_page, ciphertext_page);
if (err) {
ciphertext_page = ERR_PTR(err);
@@ -378,40 +376,29 @@ struct page *ext4_encrypt(struct inode *inode,
*
* Return: Zero on success, non-zero otherwise.
*/
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
+int ext4_decrypt(struct page *page)
{
BUG_ON(!PageLocked(page));
- return ext4_page_crypto(ctx, page->mapping->host,
+ return ext4_page_crypto(page->mapping->host,
EXT4_DECRYPT, page->index, page, page);
}
-/*
- * Convenience function which takes care of allocating and
- * deallocating the encryption context
- */
-int ext4_decrypt_one(struct inode *inode, struct page *page)
-{
- int ret;
-
- struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
-
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
- ret = ext4_decrypt(ctx, page);
- ext4_release_crypto_ctx(ctx);
- return ret;
-}
-
int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
{
struct ext4_crypto_ctx *ctx;
struct page *ciphertext_page = NULL;
struct bio *bio;
- ext4_lblk_t lblk = ex->ee_block;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
ext4_fsblk_t pblk = ext4_ext_pblock(ex);
unsigned int len = ext4_ext_get_actual_len(ex);
- int err = 0;
+ int ret, err = 0;
+
+#if 0
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "ext4_encrypted_zeroout ino %lu lblk %u len %u",
+ (unsigned long) inode->i_ino, lblk, len);
+#endif
BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
@@ -426,7 +413,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
}
while (len--) {
- err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
ZERO_PAGE(0), ciphertext_page);
if (err)
goto errout;
@@ -437,17 +424,26 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
goto errout;
}
bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_iter.bi_sector = pblk;
- err = bio_add_page(bio, ciphertext_page,
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ ret = bio_add_page(bio, ciphertext_page,
inode->i_sb->s_blocksize, 0);
- if (err) {
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "bio_add_page failed: %d", ret);
+ WARN_ON(1);
bio_put(bio);
+ err = -EIO;
goto errout;
}
err = submit_bio_wait(WRITE, bio);
+ if ((err == 0) && bio->bi_error)
+ err = -EIO;
bio_put(bio);
if (err)
goto errout;
+ lblk++; pblk++;
}
err = 0;
errout:
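The bi_sector change is the substantive fix here: bio sector numbers are in 512-byte units, while pblk counts filesystem blocks. A worked example assuming 4K blocks:

        /* blocksize_bits = 12 for 4K blocks, so the shift is 12 - 9 = 3:
         * pblk = 1000 -> bi_sector = 1000 << 3 = 8000 (512-byte sectors).
         * The old code passed pblk through unshifted, writing the zeroed
         * ciphertext to the wrong sectors on any block size above 512. */
        bio->bi_iter.bi_sector =
                (sector_t)pblk << (inode->i_sb->s_blocksize_bits - 9);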
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 847f919c84d9..2fbef8a14760 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode,
ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode,
ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
res = crypto_ablkcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 1d510c11b100..c5882b36e558 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
EXT4_AES_256_XTS_KEY_SIZE, NULL);
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -121,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
struct key *keyring_key = NULL;
struct ext4_encryption_key *master_key;
struct ext4_encryption_context ctx;
- struct user_key_payload *ukp;
+ const struct user_key_payload *ukp;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct crypto_ablkcipher *ctfm;
const char *cipher_str;
@@ -208,8 +207,13 @@ retry:
goto out;
}
crypt_info->ci_keyring_key = keyring_key;
- BUG_ON(keyring_key->type != &key_type_logon);
- ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "ext4: key type must be logon\n");
+ res = -ENOKEY;
+ goto out;
+ }
+ ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
res = -EINVAL;
goto out;
@@ -217,7 +221,13 @@ retry:
master_key = (struct ext4_encryption_key *)ukp->data;
BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
EXT4_KEY_DERIVATION_NONCE_SIZE);
- BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
+ if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "ext4: key size incorrect: %d\n",
+ master_key->size);
+ res = -ENOKEY;
+ goto out;
+ }
res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
raw_key);
if (res)
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index a640ec2c4b13..ad050698143f 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
if ((parent == NULL) || (child == NULL)) {
pr_err("parent %p child %p\n", parent, child);
- BUG_ON(1);
+ WARN_ON(1); /* Should never happen */
+ return 0;
}
/* no restrictions if the parent directory is not encrypted */
if (!ext4_encrypted_inode(parent))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f9e14911918c..1d1bca74f844 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -40,8 +40,7 @@ static int is_dx_dir(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ if (ext4_has_feature_dir_index(inode->i_sb) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1) ||
ext4_has_inline_data(inode)))
@@ -621,14 +620,14 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
- return -EIO;
+ return -EFSCORRUPTED;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
if ((char *) de > top)
- return -EIO;
+ return -EFSCORRUPTED;
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fd1f28be5296..cc7ca4e87144 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -26,6 +26,7 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
+#include <linux/version.h>
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
@@ -374,6 +375,7 @@ struct flex_groups {
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
+#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -431,6 +433,7 @@ enum {
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
+ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
@@ -475,6 +478,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
+ CHECK_FLAG_VALUE(PROJINHERIT);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -692,6 +696,7 @@ struct ext4_inode {
__le32 i_crtime; /* File Creation time */
__le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
__le32 i_version_hi; /* high 32 bits for 64-bit version */
+ __le32 i_projid; /* Project ID */
};
struct move_extent {
@@ -723,19 +728,55 @@ struct move_extent {
<= (EXT4_GOOD_OLD_INODE_SIZE + \
(einode)->i_extra_isize)) \
+/*
+ * We use an encoding that preserves the times for extra epoch "00":
+ *
+ * extra  msb of                          adjust for signed
+ * epoch  32-bit                          32-bit tv_sec to
+ * bits   time   decoded 64-bit tv_sec    64-bit tv_sec    valid time range
+ * 0 0    1     -0x80000000..-0x00000001  0x000000000      1901-12-13..1969-12-31
+ * 0 0    0      0x000000000..0x07fffffff 0x000000000      1970-01-01..2038-01-19
+ * 0 1    1      0x080000000..0x0ffffffff 0x100000000      2038-01-19..2106-02-07
+ * 0 1    0      0x100000000..0x17fffffff 0x100000000      2106-02-07..2174-02-25
+ * 1 0    1      0x180000000..0x1ffffffff 0x200000000      2174-02-25..2242-03-16
+ * 1 0    0      0x200000000..0x27fffffff 0x200000000      2242-03-16..2310-04-04
+ * 1 1    1      0x280000000..0x2ffffffff 0x300000000      2310-04-04..2378-04-22
+ * 1 1    0      0x300000000..0x37fffffff 0x300000000      2378-04-22..2446-05-10
+ *
+ * Note that previous versions of the kernel on 64-bit systems would
+ * incorrectly use extra epoch bits 1,1 for dates between 1901 and
+ * 1970. e2fsck will correct this, assuming that it is run on the
+ * affected filesystem before 2242.
+ */
+
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
- return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
- ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+ u32 extra = sizeof(time->tv_sec) > 4 ?
+ ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0;
+ return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
{
- if (sizeof(time->tv_sec) > 4)
- time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
- << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ if (unlikely(sizeof(time->tv_sec) > 4 &&
+ (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)
+ /* Handle legacy encoding of pre-1970 dates with epoch
+ * bits 1,1. We assume that by kernel version 4.20,
+ * everyone will have run fsck over the affected
+ * filesystems to correct the problem. (This
+ * backwards compatibility may be removed before this
+ * time, at the discretion of the ext4 developers.)
+ */
+ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
+ if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
+ extra_bits = 0;
+ time->tv_sec += extra_bits << 32;
+#else
+ time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+#endif
+ }
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
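A worked example of the encode/decode pair, assuming tv_sec was sign-extended from the 32-bit on-disk seconds field (as the EXT4_INODE_GET_XTIME macros do):

        /* 2038-01-19T03:14:08Z, one second past the signed 32-bit limit:
         *   tv_sec      = 0x080000000
         *   (s32)tv_sec = -0x80000000   (msb of the low word is set)
         *   epoch bits  = ((0x080000000 - (s64)(s32)0x80000000) >> 32) & 3 = 1
         * Decode sign-extends the low word back to -0x80000000 and adds
         * 1 << 32, recovering 0x080000000 -- the table row
         * "0 1  1  0x080000000..0x0ffffffff  0x100000000" in action. */
        struct timespec ts = { .tv_sec = 0x080000000LL, .tv_nsec = 0 };
        __le32 extra = ext4_encode_extra_time(&ts);     /* epoch bits == 1 */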
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -1019,6 +1060,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
+#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
+ specified journal checksum */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1179,7 +1223,9 @@ struct ext4_super_block {
__u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
__u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
__le32 s_lpf_ino; /* Location of the lost+found inode */
- __le32 s_reserved[100]; /* Padding to the end of the block */
+ __le32 s_prj_quota_inum; /* inode for tracking project quota */
+ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */
+ __le32 s_reserved[98]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1522,6 +1568,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
* Feature set definitions
*/
+/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */
#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
@@ -1566,6 +1613,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
+#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1578,11 +1626,99 @@ static inline int ext4_encrypted_inode(struct inode *inode)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_compat |= \
+ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_compat &= \
+ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
+ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_incompat |= \
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_incompat &= \
+ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+}
+
+EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC)
+EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL)
+EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
+EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
+EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
+EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+
+EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
+EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR)
+EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK)
+EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA)
+EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC)
+EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+
+EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
+EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV)
+EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS)
+EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT)
+EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP)
+EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE)
+EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED)
+EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR)
+EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
+
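Each *_FUNCS() line stamps out three inline helpers from the macro bodies above; for example, EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) expands to:

        static inline bool ext4_has_feature_csum_seed(struct super_block *sb)
        {
                return ((EXT4_SB(sb)->s_es->s_feature_incompat &
                         cpu_to_le32(EXT4_FEATURE_INCOMPAT_CSUM_SEED)) != 0);
        }
        static inline void ext4_set_feature_csum_seed(struct super_block *sb)
        {
                EXT4_SB(sb)->s_es->s_feature_incompat |=
                        cpu_to_le32(EXT4_FEATURE_INCOMPAT_CSUM_SEED);
        }
        /* ...plus ext4_clear_feature_csum_seed(), clearing the same bit. */

which is what lets the open-coded EXT4_HAS_*_FEATURE() tests throughout this patch collapse to single readable calls.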
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_META_BG)
@@ -1598,7 +1734,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
-#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -1607,7 +1743,8 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
- EXT4_FEATURE_INCOMPAT_ENCRYPT)
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1619,6 +1756,40 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA)
+#define EXTN_FEATURE_FUNCS(ver) \
+static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
+}
+
+EXTN_FEATURE_FUNCS(2)
+EXTN_FEATURE_FUNCS(3)
+EXTN_FEATURE_FUNCS(4)
+
+static inline bool ext4_has_compat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
+}
+static inline bool ext4_has_ro_compat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
+}
+static inline bool ext4_has_incompat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
+}
+
/*
* Default values for user and/or group using reserved blocks
*/
@@ -1769,8 +1940,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
* (c) Daniel Phillips, 2001
*/
-#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
- EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -2063,8 +2233,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
void ext4_restore_control_page(struct page *data_page);
struct page *ext4_encrypt(struct inode *inode,
struct page *plaintext_page);
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
-int ext4_decrypt_one(struct inode *inode, struct page *page);
+int ext4_decrypt(struct page *page);
int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -2072,7 +2241,7 @@ int ext4_init_crypto(void);
void ext4_exit_crypto(void);
static inline int ext4_sb_has_crypto(struct super_block *sb)
{
- return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ return ext4_has_feature_encrypt(sb);
}
#else
static inline int ext4_init_crypto(void) { return 0; }
@@ -2193,8 +2362,7 @@ int ext4_insert_dentry(struct inode *dir,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
+ if (!ext4_has_feature_dir_index(inode->i_sb))
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
static unsigned char ext4_filetype_table[] = {
@@ -2203,8 +2371,7 @@ static unsigned char ext4_filetype_table[] = {
static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
+ if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
return DT_UNKNOWN;
return ext4_filetype_table[filetype];
@@ -2245,6 +2412,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
+extern const struct file_operations ext4_seq_mb_groups_fops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_mb_init(struct super_block *);
@@ -2372,6 +2540,7 @@ extern int ext4_group_extend(struct super_block *sb,
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
@@ -2534,15 +2703,13 @@ extern int ext4_register_li_request(struct super_block *sb,
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
- (EXT4_SB(sb)->s_chksum_driver != NULL);
+ return ext4_has_feature_gdt_csum(sb) ||
+ EXT4_SB(sb)->s_chksum_driver != NULL;
}
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
- WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
!EXT4_SB(sb)->s_chksum_driver);
return (EXT4_SB(sb)->s_chksum_driver != NULL);
@@ -2889,7 +3056,7 @@ static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ if (ext4_has_feature_filetype(sb))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
@@ -2903,6 +3070,12 @@ extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
+/* sysfs.c */
+extern int ext4_register_sysfs(struct super_block *sb);
+extern void ext4_unregister_sysfs(struct super_block *sb);
+extern int __init ext4_init_sysfs(void);
+extern void ext4_exit_sysfs(void);
+
/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
@@ -3049,4 +3222,7 @@ extern void ext4_resize_end(struct super_block *sb);
#endif /* __KERNEL__ */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
#endif /* _EXT4_H */
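
For readers tracking the new predicate scheme: EXTN_FEATURE_FUNCS(ver) token-pastes the version number into three inline functions, each testing whether the on-disk feature word carries any bit outside that version's supported set. Expanded by hand, the compat predicate generated by EXTN_FEATURE_FUNCS(4) reads:

static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb)
{
	/* true iff a compat bit is set on disk that EXT4_FEATURE_COMPAT_SUPP
	 * does not cover; note the mask is complemented before the LE swap */
	return ((EXT4_SB(sb)->s_es->s_feature_compat &
		 cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0);
}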
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d41843181818..e770c1ee4613 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return 0;
}
+ err = handle->h_err;
if (!handle->h_transaction) {
- err = jbd2_journal_stop(handle);
- return handle->h_err ? handle->h_err : err;
+ rc = jbd2_journal_stop(handle);
+ return err ? err : rc;
}
sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
rc = jbd2_journal_stop(handle);
if (!err)
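
The reordering in __ext4_journal_stop() matters because jbd2_journal_stop() frees the handle, so handle->h_err must be sampled before the call in the no-transaction branch as well. Condensed, that branch now behaves as:

	err = handle->h_err;		/* read before the handle goes away */
	rc = jbd2_journal_stop(handle);	/* frees the handle */
	return err ? err : rc;		/* a recorded handle error wins */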
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9c5b49fb281e..5f5846211095 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -34,8 +34,7 @@
*/
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
- (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 20U : 8U)
+ (ext4_has_feature_extents(sb) ? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
@@ -84,17 +83,16 @@
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
- 1 : 0)
+ ext4_has_feature_quota(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ ext4_has_feature_quota(sb)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ ext4_has_feature_quota(sb)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
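
Plugging in EXT4_SINGLEDATA_TRANS_BLOCKS(sb) makes these reservations concrete: the single-data cost is 20 blocks with extents and 8 without, so for example the quota-init reservation evaluates to:

	/* with extents:    DQUOT_INIT_ALLOC * (20 - 3) + 3 + DQUOT_INIT_REWRITE */
	/* without extents: DQUOT_INIT_ALLOC * ( 8 - 3) + 3 + DQUOT_INIT_REWRITE */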
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2553aa8b608d..551353b1b17a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -442,7 +442,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
- int max = 0;
+ int max = 0, err = -EFSCORRUPTED;
if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
error_msg = "invalid magic";
@@ -473,6 +473,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
if (ext_depth(inode) != depth &&
!ext4_extent_block_csum_verify(inode, eh)) {
error_msg = "extent tree corrupted";
+ err = -EFSBADCRC;
goto corrupted;
}
return 0;
@@ -485,7 +486,7 @@ corrupted:
le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
max, le16_to_cpu(eh->eh_depth), depth);
- return -EIO;
+ return err;
}
#define ext4_ext_check(inode, eh, depth, pblk) \
@@ -899,7 +900,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
flags);
- if (unlikely(IS_ERR(bh))) {
+ if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
}
@@ -910,7 +911,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto err;
}
path[ppos].p_bh = bh;
@@ -959,7 +960,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d == ei_block %d!",
logical, le32_to_cpu(curp->p_idx->ei_block));
- return -EIO;
+ return -EFSCORRUPTED;
}
if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
@@ -968,7 +969,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
"eh_entries %d >= eh_max %d!",
le16_to_cpu(curp->p_hdr->eh_entries),
le16_to_cpu(curp->p_hdr->eh_max));
- return -EIO;
+ return -EFSCORRUPTED;
}
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
@@ -992,7 +993,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EIO;
+ return -EFSCORRUPTED;
}
ix->ei_block = cpu_to_le32(logical);
@@ -1001,7 +1002,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = ext4_ext_dirty(handle, inode, curp);
@@ -1042,7 +1043,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
* border from split point */
if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
- return -EIO;
+ return -EFSCORRUPTED;
}
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
@@ -1086,7 +1087,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
newblock = ablocks[--a];
if (unlikely(newblock == 0)) {
EXT4_ERROR_INODE(inode, "newblock == 0!");
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
@@ -1112,7 +1113,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
path[depth].p_hdr->eh_entries,
path[depth].p_hdr->eh_max);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy from next extent */
@@ -1151,7 +1152,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
k = depth - at - 1;
if (unlikely(k < 0)) {
EXT4_ERROR_INODE(inode, "k %d < 0!", k);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
if (k)
@@ -1191,7 +1192,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode,
"EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
le32_to_cpu(path[i].p_ext->ee_block));
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy indexes */
@@ -1425,7 +1426,7 @@ static int ext4_ext_search_left(struct inode *inode,
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
@@ -1444,7 +1445,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
*logical, le32_to_cpu(ex->ee_block));
- return -EIO;
+ return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
@@ -1455,7 +1456,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
return 0;
@@ -1465,7 +1466,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d < ee_block %d + ee_len %d!",
*logical, le32_to_cpu(ex->ee_block), ee_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
@@ -1495,7 +1496,7 @@ static int ext4_ext_search_right(struct inode *inode,
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
@@ -1514,7 +1515,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"first_extent(path[%d].p_hdr) != ex",
depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
@@ -1522,7 +1523,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"ix != EXT_FIRST_INDEX *logical %d!",
*logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
goto found_extent;
@@ -1532,7 +1533,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d < ee_block %d + ee_len %d!",
*logical, le32_to_cpu(ex->ee_block), ee_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
@@ -1670,7 +1671,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
if (unlikely(ex == NULL || eh == NULL)) {
EXT4_ERROR_INODE(inode,
"ex %p == NULL or eh %p == NULL", ex, eh);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (depth == 0) {
@@ -1938,14 +1939,14 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* try to insert block into found extent and return */
@@ -2172,7 +2173,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (unlikely(path[depth].p_hdr == NULL)) {
up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
ex = path[depth].p_ext;
@@ -2241,7 +2242,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (unlikely(es.es_len == 0)) {
EXT4_ERROR_INODE(inode, "es.es_len == 0");
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
@@ -2264,7 +2265,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
"next extent == %u, next "
"delalloc extent = %u",
next, next_del);
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
}
@@ -2363,7 +2364,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = ext4_ext_get_access(handle, inode, path);
if (err)
@@ -2612,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* find where to start removing */
ex = path[depth].p_ext;
@@ -2666,7 +2667,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
"on extent %u:%u",
start, end, ex_ee_block,
ex_ee_block + ex_ee_len - 1);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
@@ -2841,7 +2842,7 @@ again:
EXT4_ERROR_INODE(inode,
"path[%d].p_hdr == NULL",
depth);
- err = -EIO;
+ err = -EFSCORRUPTED;
}
goto out;
}
@@ -2920,7 +2921,7 @@ again:
i = 0;
if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out;
}
}
@@ -2978,7 +2979,7 @@ again:
* Should be a no-op if we did IO above. */
cond_resched();
if (WARN_ON(i + 1 > depth)) {
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
path[i + 1].p_bh = bh;
@@ -3054,7 +3055,7 @@ void ext4_ext_init(struct super_block *sb)
* possible initialization would be here
*/
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
@@ -3081,7 +3082,7 @@ void ext4_ext_init(struct super_block *sb)
*/
void ext4_ext_release(struct super_block *sb)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ if (!ext4_has_feature_extents(sb))
return;
#ifdef EXTENTS_STATS
@@ -3345,7 +3346,7 @@ static int ext4_split_extent(handle_t *handle,
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EIO;
+ return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
split_flag1 = 0;
@@ -3558,6 +3559,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
+ if (ext4_encrypted_inode(inode))
+ max_zeroout = 0;
+
/* If extent is less than s_max_zeroout_kb, zeroout directly */
if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
@@ -3970,7 +3974,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4308,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
"lblock: %lu, depth: %d pblock %lld",
(unsigned long) map->m_lblk, depth,
path[depth].p_block);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out2;
}
@@ -5271,7 +5275,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (depth == path->p_depth) {
ex_start = path[depth].p_ext;
if (!ex_start)
- return -EIO;
+ return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
@@ -5411,7 +5415,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
if (!extent) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) *iterator);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (SHIFT == SHIFT_LEFT && *iterator >
le32_to_cpu(extent->ee_block)) {
@@ -5792,7 +5796,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
int split = 0;
path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
- if (unlikely(IS_ERR(path1))) {
+ if (IS_ERR(path1)) {
*erp = PTR_ERR(path1);
path1 = NULL;
finish:
@@ -5800,7 +5804,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
goto repeat;
}
path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
- if (unlikely(IS_ERR(path2))) {
+ if (IS_ERR(path2)) {
*erp = PTR_ERR(path2);
path2 = NULL;
goto finish;
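
The blanket -EIO returns in extents.c are split along one consistent line: -EFSCORRUPTED (EUCLEAN) for structural inconsistencies caught by sanity checks, and -EFSBADCRC (EBADMSG) when the structure parses but its checksum does not verify. A hypothetical helper (fs_errstr is not part of the patch) mirroring the strings ext4_decode_error() gains in super.c further below:

static const char *fs_errstr(int err)
{
	switch (err) {
	case -EFSCORRUPTED:	/* EUCLEAN: inconsistent structure */
		return "Corrupt filesystem";
	case -EFSBADCRC:	/* EBADMSG: checksum mismatch */
		return "Filesystem failed CRC";
	default:
		return "IO failure";
	}
}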
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 26724aeece73..ac748b3af1c1 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1089,20 +1089,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
return nr_shrunk;
}
-static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
- return *pos ? NULL : SEQ_START_TOKEN;
-}
-
-static void *
-ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- return NULL;
-}
-
-static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
-{
- struct ext4_sb_info *sbi = seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
struct ext4_es_stats *es_stats = &sbi->s_es_stats;
struct ext4_inode_info *ei, *max = NULL;
unsigned int inode_cnt = 0;
@@ -1143,45 +1132,6 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
return 0;
}
-static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
- .start = ext4_es_seq_shrinker_info_start,
- .next = ext4_es_seq_shrinker_info_next,
- .stop = ext4_es_seq_shrinker_info_stop,
- .show = ext4_es_seq_shrinker_info_show,
-};
-
-static int
-ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = PDE_DATA(inode);
- }
-
- return ret;
-}
-
-static int
-ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
-{
- return seq_release(inode, file);
-}
-
-static const struct file_operations ext4_es_seq_shrinker_info_fops = {
- .owner = THIS_MODULE,
- .open = ext4_es_seq_shrinker_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = ext4_es_seq_shrinker_info_release,
-};
-
int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
int err;
@@ -1210,10 +1160,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err2;
- if (sbi->s_proc)
- proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
- &ext4_es_seq_shrinker_info_fops, sbi);
-
return 0;
err2:
@@ -1225,8 +1171,6 @@ err1:
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- if (sbi->s_proc)
- remove_proc_entry("es_shrinker_info", sbi->s_proc);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 691b52613ce4..f7aa24f4642d 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -172,4 +172,6 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+
#endif /* _EXT4_EXTENTS_STATUS_H */
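
With the hand-rolled seq_operations and file_operations deleted, ext4_seq_es_shrinker_info_show() is a bare show callback for a one-record file. A conventional registration, assuming the new sysfs/proc glue follows the stock single_open() pattern (the open function name here is illustrative):

static int es_shrinker_info_open(struct inode *inode, struct file *file)
{
	/* single_open() supplies start/next/stop for a single-record seq file */
	return single_open(file, ext4_seq_es_shrinker_info_show, PDE_DATA(inode));
}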
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 619bfc1fda8c..1b8024d26f65 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,7 +64,7 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
}
/* Initializes an uninitialized inode bitmap */
-static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+static int ext4_init_inode_bitmap(struct super_block *sb,
struct buffer_head *bh,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
@@ -89,7 +89,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return 0;
+ return -EFSBADCRC;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
@@ -99,7 +99,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp);
- return EXT4_INODES_PER_GROUP(sb);
+ return 0;
}
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
@@ -112,6 +112,42 @@ void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+static int ext4_validate_inode_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (buffer_verified(bh))
+ return 0;
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
+
+ ext4_lock_group(sb, block_group);
+ blk = ext4_inode_bitmap(sb, desc);
+ if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+ "inode_bitmap = %llu", block_group, blk);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return -EFSBADCRC;
+ }
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+ return 0;
+}
+
/*
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
@@ -124,12 +160,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
- struct ext4_group_info *grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_inode_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
@@ -137,7 +172,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-EIO);
}
if (bitmap_uptodate(bh))
goto verify;
@@ -150,12 +185,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ err = ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
+ if (err)
+ goto out;
return bh;
}
ext4_unlock_group(sb, block_group);
@@ -182,31 +219,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-EIO);
}
verify:
- ext4_lock_group(sb, block_group);
- if (!buffer_verified(bh) &&
- !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8)) {
- ext4_unlock_group(sb, block_group);
- put_bh(bh);
- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
- "inode_bitmap = %llu", block_group, bitmap_blk);
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, desc);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return NULL;
- }
- ext4_unlock_group(sb, block_group);
- set_buffer_verified(bh);
+ err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
+ if (err)
+ goto out;
return bh;
+out:
+ put_bh(bh);
+ return ERR_PTR(err);
}
/*
@@ -286,8 +309,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
grp = ext4_get_group_info(sb, block_group);
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ fatal = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ goto error_return;
+ }
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
goto error_return;
+ }
BUFFER_TRACE(bitmap_bh, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -826,7 +856,9 @@ got_group:
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+ IS_ERR(inode_bitmap_bh)) {
+ inode_bitmap_bh = NULL;
if (++group == ngroups)
group = 0;
continue;
@@ -902,8 +934,8 @@ got:
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (!block_bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
@@ -1045,7 +1077,7 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
ei->i_inline_off = 0;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ if (ext4_has_feature_inline_data(sb))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1060,7 +1092,7 @@ got:
if (err)
goto fail_free_drop;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
@@ -1116,14 +1148,17 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
/* Error cases - e2fsck has already cleaned up for us */
if (ino > max_ino) {
ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
+ err = -EFSCORRUPTED;
goto error;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (!bitmap_bh) {
- ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
+ ino, err);
goto error;
}
@@ -1198,8 +1233,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_inode_bitmap(sb, i);
- if (!bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ bitmap_bh = NULL;
continue;
+ }
x = ext4_count_free(bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb) / 8);
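
With ext4_read_inode_bitmap() returning ERR_PTR() values instead of NULL, callers can propagate the precise failure (-EIO for a failed read, -EFSCORRUPTED or -EFSBADCRC from validation) instead of inventing one. The caller idiom used throughout this patch:

	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	if (IS_ERR(bitmap_bh)) {
		err = PTR_ERR(bitmap_bh);	/* keep the specific code */
		bitmap_bh = NULL;		/* brelse(NULL) later is a no-op */
		goto error_return;
	}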
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 2468261748b2..355ef9c36c87 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -562,11 +562,10 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
/*
* Okay, we need to do block allocation.
*/
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EUCLEAN;
+ return -EFSCORRUPTED;
}
/* Set up for the direct block allocation */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cd944a7a99cd..d884989cc83d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -434,8 +434,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
memset((void *)ext4_raw_inode(&is.iloc)->i_block,
0, EXT4_MIN_INLINE_DATA_SIZE);
- if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(inode->i_sb)) {
if (S_ISDIR(inode->i_mode) ||
S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 612fbcf76b5c..b3bd912df6bf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -378,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
"lblock %lu mapped to illegal pblock "
"(length %d)", (unsigned long) map->m_lblk,
map->m_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -480,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
/* We can handle the block number less than EXT_MAX_BLOCKS */
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
- return -EIO;
+ return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@ -965,7 +965,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = ext4_decrypt_one(inode, page);
+ err = ext4_decrypt(page);
return err;
}
#endif
@@ -1181,6 +1181,38 @@ errout:
return ret ? ret : copied;
}
+/*
+ * This is a private version of page_zero_new_buffers() which doesn't
+ * set the buffer to be dirty, since in data=journalled mode we need
+ * to call ext4_handle_dirty_metadata() instead.
+ */
+static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+ unsigned int block_start = 0, block_end;
+ struct buffer_head *head, *bh;
+
+ bh = head = page_buffers(page);
+ do {
+ block_end = block_start + bh->b_size;
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, size;
+
+ start = max(from, block_start);
+ size = min(to, block_end) - start;
+
+ zero_user(page, start, size);
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_new(bh);
+ }
+ }
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
static int ext4_journalled_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1207,7 +1239,7 @@ static int ext4_journalled_write_end(struct file *file,
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
- page_zero_new_buffers(page, from+copied, to);
+ zero_new_buffers(page, from+copied, to);
}
ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
@@ -1815,11 +1847,22 @@ static int ext4_writepage(struct page *page,
* the page. But we may reach here when we do a journal commit via
* journal_submit_inode_data_buffers() and in that case we must write
* allocated buffers to achieve data=ordered mode guarantees.
+ *
+ * Also, if there is only one buffer per page (the fs block
+ * size == the page size), if one buffer needs block
+ * allocation or needs to modify the extent tree to clear the
+ * unwritten flag, we know that the page can't be written at
+ * all, so we might as well refuse the write immediately.
+ * Unfortunately if the block size != page size, we can't as
+ * easily detect this case using ext4_walk_page_buffers(), but
+ * for the extremely common case, this is an optimization that
+ * skips a useless round trip through ext4_bio_write_page().
*/
if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
- if (current->flags & PF_MEMALLOC) {
+ if ((current->flags & PF_MEMALLOC) ||
+ (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -2599,8 +2642,7 @@ static int ext4_nonda_switch(struct super_block *sb)
/* We always reserve for an inode update; the superblock could be there too */
static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
{
- if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+ if (likely(ext4_has_feature_large_file(inode->i_sb)))
return 1;
if (pos + len <= 0x7fffffffULL)
@@ -3344,7 +3386,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
int err = 0;
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
@@ -3393,7 +3435,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!ext4_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_CACHE_SIZE);
- WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+ WARN_ON_ONCE(ext4_decrypt(page));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3820,7 +3862,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
- return -EIO;
+ return -EFSCORRUPTED;
iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@ -4006,8 +4048,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
struct inode *inode = &(ei->vfs_inode);
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
/* we are using combined 48 bit field */
i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
le32_to_cpu(raw_inode->i_blocks_lo);
@@ -4068,7 +4109,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
EXT4_INODE_SIZE(inode->i_sb));
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
}
} else
@@ -4088,7 +4129,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
EXT4_ERROR_INODE(inode, "checksum invalid");
- ret = -EIO;
+ ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4130,7 +4171,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
@@ -4203,7 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
ei->i_file_acl);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
@@ -4242,6 +4283,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
+ inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -4254,7 +4296,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else if (ino == EXT4_BOOT_LOADER_INO) {
make_bad_inode(inode);
} else {
- ret = -EIO;
+ ret = -EFSCORRUPTED;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
@@ -4272,7 +4314,7 @@ bad_inode:
struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
{
if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
return ext4_iget(sb, ino);
}
@@ -4294,7 +4336,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ if (!ext4_has_feature_huge_file(sb))
return -EFBIG;
if (i_blocks <= 0xffffffffffffULL) {
@@ -4455,8 +4497,7 @@ static int ext4_do_update_inode(handle_t *handle,
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+ if (!ext4_has_feature_large_file(sb) ||
EXT4_SB(sb)->s_es->s_rev_level ==
cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
@@ -4505,8 +4546,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (err)
goto out_brelse;
ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
}
@@ -5244,7 +5284,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = __block_page_mkwrite(vma, vmf,
+ ret = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
@@ -5291,7 +5331,7 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = __block_page_mkwrite(vma, vmf, get_block);
+ ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
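
The two allocation-mask changes above swap open-coded masking for mapping_gfp_constraint(); for reference, the helper in include/linux/pagemap.h of this era is simply:

static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
					   gfp_t gfp_mask)
{
	/* intersect the mapping's allowed GFP flags with the caller's mask */
	return mapping_gfp_mask(mapping) & gfp_mask;
}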
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1346cfa355d0..5e872fd40e5e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,8 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode_bl->i_version = 1;
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode_bl);
} else
@@ -383,8 +382,7 @@ setversion_out:
goto group_extend_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -432,8 +430,7 @@ group_extend_out:
goto mext_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -470,8 +467,7 @@ mext_out:
goto group_add_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -553,8 +549,7 @@ group_add_out:
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not (yet) supported with bigalloc");
return -EOPNOTSUPP;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 34b610ea5030..61eaf74dca37 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -874,8 +874,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
bh[i] = NULL;
continue;
}
- if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
- err = -ENOMEM;
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ if (IS_ERR(bh[i])) {
+ err = PTR_ERR(bh[i]);
+ bh[i] = NULL;
goto out;
}
mb_debug(1, "read bitmap for group %u\n", group);
@@ -883,8 +885,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* wait for I/O completion */
for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
- if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
- err = -EIO;
+ int err2;
+
+ if (!bh[i])
+ continue;
+ err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
+ if (!err)
+ err = err2;
}
first_block = page->index * blocks_per_page;
@@ -2333,7 +2340,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static const struct file_operations ext4_mb_seq_groups_fops = {
+const struct file_operations ext4_seq_mb_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2447,7 +2454,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
kmalloc(sb->s_blocksize, GFP_NOFS);
BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
bh = ext4_read_block_bitmap(sb, group);
- BUG_ON(bh == NULL);
+ BUG_ON(IS_ERR_OR_NULL(bh));
memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
sb->s_blocksize);
put_bh(bh);
@@ -2661,10 +2668,6 @@ int ext4_mb_init(struct super_block *sb)
if (ret != 0)
goto out_free_locality_groups;
- if (sbi->s_proc)
- proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_groups_fops, sb);
-
return 0;
out_free_locality_groups:
@@ -2705,9 +2708,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
-
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2896,10 +2896,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- err = -EIO;
bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (!bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto out_err;
+ }
BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -3843,8 +3845,10 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
return 0;
bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u", group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_error(sb, "Error %d reading block bitmap for %u",
+ err, group);
return 0;
}
@@ -4015,9 +4019,10 @@ repeat:
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u",
- group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_error(sb, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4682,22 +4687,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_debug("freeing block %llu\n", block);
trace_ext4_free_blocks(inode, block, count, flags);
- if (flags & EXT4_FREE_BLOCKS_FORGET) {
- struct buffer_head *tbh = bh;
- int i;
-
- BUG_ON(bh && (count > 1));
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
- for (i = 0; i < count; i++) {
- cond_resched();
- if (!bh)
- tbh = sb_find_get_block(inode->i_sb,
- block + i);
- if (!tbh)
- continue;
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, tbh, block + i);
- }
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
}
/*
@@ -4742,6 +4736,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
count += sbi->s_cluster_ratio - overflow;
}
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ if (!bh)
+ continue;
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block + i);
+ }
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4761,8 +4768,9 @@ do_more:
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
}
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4931,8 +4939,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
}
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
}
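
Two things to note in the ext4_free_blocks() rework: the FORGET pass is split on whether the caller supplied a buffer_head, and the bh-less lookup loop now runs only after the bigalloc overflow handling, which may have widened count to a cluster boundary, so the forget pass appears intended to cover the final, cluster-aligned range. Condensed:

	if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
		BUG_ON(count > 1);	/* a caller-supplied bh is one block */
		ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
			    inode, bh, block);
	}
	/* ... bigalloc fix-ups may grow 'count' here ... */
	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
		for (i = 0; i < count; i++) {
			cond_resched();
			bh = sb_find_get_block(inode->i_sb, block + i);
			if (bh)
				ext4_forget(handle,
					    flags & EXT4_FREE_BLOCKS_METADATA,
					    inode, bh, block + i);
		}
	}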
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6163ad21cb0e..a4651894cc33 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -448,8 +448,7 @@ int ext4_ext_migrate(struct inode *inode)
* If the filesystem does not support extents, or the inode
* already is extent-based, error out.
*/
- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ if (!ext4_has_feature_extents(inode->i_sb) ||
(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
@@ -625,13 +624,11 @@ int ext4_ind_migrate(struct inode *inode)
handle_t *handle;
int ret;
- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ if (ext4_has_feature_bigalloc(inode->i_sb))
return -EOPNOTSUPP;
/*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6eb1a619890c..0a512aa81bf7 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -98,10 +98,12 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC &&
- ext4_mmp_csum_verify(sb, mmp))
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ ret = -EFSCORRUPTED;
+ else if (!ext4_mmp_csum_verify(sb, mmp))
+ ret = -EFSBADCRC;
+ else
return 0;
- ret = -EINVAL;
warn_exit:
ext4_warning(sb, "Error %d while reading MMP block %llu",
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9f61e7679a6d..f27e0c2598c5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!bh) {
ext4_error_inode(inode, func, line, block,
"Directory hole found");
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
dirent = (struct ext4_dir_entry *) bh->b_data;
/* Determine whether or not we have an index block */
@@ -124,7 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!is_dx_block && type == INDEX) {
ext4_error_inode(inode, func, line, block,
"directory leaf block found instead of index block");
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (!ext4_has_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
@@ -142,7 +142,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_error_inode(inode, func, line, block,
"Directory index failed checksum");
brelse(bh);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSBADCRC);
}
}
if (!is_dx_block) {
@@ -152,7 +152,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_error_inode(inode, func, line, block,
"Directory block failed checksum");
brelse(bh);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSBADCRC);
}
}
return bh;
@@ -1429,7 +1429,7 @@ restart:
}
num++;
bh = ext4_getblk(NULL, dir, b++, 0);
- if (unlikely(IS_ERR(bh))) {
+ if (IS_ERR(bh)) {
if (ra_max == 0) {
ret = bh;
goto cleanup_and_exit;
@@ -1570,19 +1570,19 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (unlikely(ino == dir->i_ino)) {
EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
dentry);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
inode = ext4_iget_normal(dir->i_sb, ino);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -1619,7 +1619,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
EXT4_ERROR_INODE(d_inode(child),
"bad parent inode number: %u", ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
@@ -1807,7 +1807,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset)) {
- res = -EIO;
+ res = -EFSCORRUPTED;
goto return_result;
}
/* Provide crypto context and crypto buffer to ext4 match */
@@ -1967,7 +1967,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if ((char *) de >= (((char *) root) + blocksize)) {
EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
- return -EIO;
+ return -EFSCORRUPTED;
}
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
@@ -2118,7 +2118,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
goto out;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ ext4_has_feature_dir_index(sb)) {
retval = make_indexed_dir(handle, &fname, dentry,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
@@ -2315,7 +2315,7 @@ int ext4_generic_delete_entry(handle_t *handle,
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size, i))
- return -EIO;
+ return -EFSCORRUPTED;
if (de == de_del) {
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
@@ -2388,8 +2388,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
/* limit is 16-bit i_links_count */
if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
set_nlink(inode, 1);
- EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+ ext4_set_feature_dir_nlink(inode->i_sb);
}
}
}
@@ -2469,9 +2468,6 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err, credits, retries = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = dquot_initialize(dir);
if (err)
return err;
@@ -2934,7 +2930,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
inode = d_inode(dentry);
- retval = -EIO;
+ retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_rmdir;
@@ -3008,7 +3004,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode = d_inode(dentry);
- retval = -EIO;
+ retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
@@ -3136,6 +3132,7 @@ static int ext4_symlink(struct inode *dir,
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
if (!encryption_required)
inode->i_op = &ext4_symlink_inode_operations;
+ inode_nohighmem(inode);
ext4_set_aops(inode);
/*
* We cannot call page_symlink() with transaction started
@@ -3310,7 +3307,7 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
if (!ent->dir_bh)
return retval;
if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
- return -EIO;
+ return -EFSCORRUPTED;
BUFFER_TRACE(ent->dir_bh, "get_write_access");
return ext4_journal_get_write_access(handle, ent->dir_bh);
}
@@ -3352,8 +3349,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (retval)
return retval;
ent->de->inode = cpu_to_le32(ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
+ if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
ent->dir->i_version++;
ent->dir->i_ctime = ent->dir->i_mtime =
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 84ba4d2b3a35..090b3498638e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -52,9 +52,8 @@ void ext4_exit_pageio(void)
*/
static void buffer_io_error(struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
- printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
+ bh->b_bdev,
(unsigned long long)bh->b_blocknr);
}
@@ -425,6 +424,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
+ int nr_to_submit = 0;
blocksize = 1 << inode->i_blkbits;
@@ -477,11 +477,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
set_buffer_async_write(bh);
+ nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
+ nr_to_submit) {
data_page = ext4_encrypt(inode, page);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
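
buffer_io_error() can drop its on-stack name buffer because the %pg printk specifier consumes a struct block_device pointer and prints its name (e.g. "sda1") directly, leaving no BDEVNAME_SIZE scratch space to size or overrun:

	printk_ratelimited(KERN_ERR
		"Buffer I/O error on device %pg, logical block %llu\n",
		bh->b_bdev, (unsigned long long)bh->b_blocknr);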
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 560af0437704..5dc5e95063de 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- int ret = ext4_decrypt(ctx, page);
+ int ret = ext4_decrypt(page);
if (ret) {
WARN_ON_ONCE(1);
SetPageError(page);
@@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping)))
+ mapping_gfp_constraint(mapping, GFP_KERNEL)))
goto next_page;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf0c472047e3..ad62d7acc315 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -490,7 +490,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
group_data[0].group != sbi->s_groups_count);
reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
@@ -680,8 +680,7 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
int mult = 3;
unsigned ret;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ if (!ext4_has_feature_sparse_super(sb)) {
ret = *min;
*min += 1;
return ret;
@@ -1040,7 +1039,7 @@ exit_free:
* do not copy the full number of backups at this time. The resize
* which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb, int blk_off, char *data,
+static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1065,7 +1064,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
group = ext4_list_backups(sb, &three, &five, &seven);
last = sbi->s_groups_count;
} else {
- group = ext4_meta_bg_first_group(sb, group) + 1;
+ group = ext4_get_group_number(sb, blk_off) + 1;
last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
}
@@ -1158,7 +1157,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
int i, gdb_off, gdb_num, err = 0;
int meta_bg;
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
for (i = 0; i < count; i++, group++) {
int reserved_gdb = ext4_bg_has_super(sb, group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1381,9 +1380,7 @@ static void ext4_update_super(struct super_block *sb,
ext4_debug("free blocks count %llu",
percpu_counter_read(&sbi->s_freeclusters_counter));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- sbi->s_log_groups_per_flex) {
+ if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, group_data[0].group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
@@ -1476,8 +1473,7 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_META_BG);
+ int meta_bg = ext4_has_feature_meta_bg(sb);
sector_t old_gdb = 0;
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
@@ -1585,8 +1581,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
- if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) {
ext4_warning(sb, "Can't resize non-sparse filesystem further");
return -EPERM;
}
@@ -1604,9 +1599,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
if (reserved_gdb || gdb_off == 0) {
- if (!EXT4_HAS_COMPAT_FEATURE(sb,
- EXT4_FEATURE_COMPAT_RESIZE_INODE)
- || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+		if (!ext4_has_feature_resize_inode(sb) ||
+		    !le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext4_warning(sb,
"No reserved GDT blocks, can't resize");
return -EPERM;
@@ -1825,8 +1819,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
if (err)
goto errout;
- EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ ext4_clear_feature_resize_inode(sb);
+ ext4_set_feature_meta_bg(sb);
sbi->s_es->s_first_meta_bg =
cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
@@ -1918,9 +1912,9 @@ retry:
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (ext4_has_feature_resize_inode(sb)) {
if (meta_bg) {
ext4_error(sb, "resize_inode and meta_bg enabled "
"simultaneously");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a63c7b0a10cf..f1b56ff01208 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,7 +34,6 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
@@ -54,11 +53,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
-static struct proc_dir_entry *ext4_proc_root;
-static struct kset *ext4_kset;
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
-static struct ext4_features *ext4_feat;
static int ext4_mballoc_ready;
static struct ratelimit_state ext4_mount_msg_ratelimit;
@@ -83,7 +79,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
-static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
@@ -115,8 +110,7 @@ MODULE_ALIAS("ext3");
static int ext4_verify_csum_type(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
@@ -394,9 +388,13 @@ static void ext4_handle_error(struct super_block *sb)
smp_wmb();
sb->s_flags |= MS_RDONLY;
}
- if (test_opt(sb, ERRORS_PANIC))
+ if (test_opt(sb, ERRORS_PANIC)) {
+ if (EXT4_SB(sb)->s_journal &&
+ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+ return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
+ }
}
#define ext4_error_ratelimit(sb) \
@@ -495,6 +493,12 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
char *errstr = NULL;
switch (errno) {
+ case -EFSCORRUPTED:
+ errstr = "Corrupt filesystem";
+ break;
+ case -EFSBADCRC:
+ errstr = "Filesystem failed CRC";
+ break;
case -EIO:
errstr = "IO failure";
break;
@@ -585,8 +589,12 @@ void __ext4_abort(struct super_block *sb, const char *function,
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
}
- if (test_opt(sb, ERRORS_PANIC))
+ if (test_opt(sb, ERRORS_PANIC)) {
+ if (EXT4_SB(sb)->s_journal &&
+ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+ return;
panic("EXT4-fs panic from previous error\n");
+ }
}
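
Both ext4_handle_error() and __ext4_abort() now defer the errors=panic panic until jbd2 has actually recorded the error: jbd2 sets JBD2_REC_ERR in j_flags once the errno has been written to the journal superblock, so the machine only panics after the error information would survive the reboot. The guard is equivalent to this hypothetical helper (illustration only, not code from the patch):

static bool ext4_ok_to_panic(struct super_block *sb)
{
	journal_t *journal = EXT4_SB(sb)->s_journal;

	/* No journal: nothing to record, panic right away.
	 * Journal present: panic only once jbd2 has flagged the
	 * error as committed to the journal superblock. */
	return !journal || (journal->j_flags & JBD2_REC_ERR);
}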
void __ext4_msg(struct super_block *sb,
@@ -800,6 +808,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
+ ext4_unregister_sysfs(sb);
ext4_es_unregister_shrinker(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
@@ -808,18 +817,12 @@ static void ext4_put_super(struct super_block *sb)
ext4_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!(sb->s_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
- if (sbi->s_proc) {
- remove_proc_entry("options", sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
- kobject_del(&sbi->s_kobj);
-
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kvfree(sbi->s_group_desc);
@@ -963,7 +966,7 @@ static int __init init_inodecache(void)
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
@@ -1058,7 +1061,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_WAIT);
+ wait & ~__GFP_DIRECT_RECLAIM);
return try_to_free_buffers(page);
}
@@ -1288,7 +1291,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quota options when quota turned on");
return -1;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ if (ext4_has_feature_quota(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
"when QUOTA feature is enabled");
return -1;
@@ -1381,10 +1384,10 @@ static const struct mount_opts {
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
- MOPT_EXT4_ONLY | MOPT_SET},
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
EXT4_MOUNT_JOURNAL_CHECKSUM),
- MOPT_EXT4_ONLY | MOPT_SET},
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
@@ -1513,8 +1516,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
return -1;
- if (m->flags & MOPT_EXPLICIT)
- set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_EXPLICIT) {
+ if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
+ set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+ } else
+ return -1;
+ }
if (m->flags & MOPT_CLEAR_ERR)
clear_opt(sb, ERRORS_MASK);
if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
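
Because MOPT_EXPLICIT is no longer tied to a single option, the handler must inspect m->mount_opt to know which "user asked for this explicitly" bit to record; the final else guards against a table entry gaining MOPT_EXPLICIT without a matching branch here. set_opt2() itself is only a flag-setter, roughly (paraphrased from ext4.h):

#define set_opt2(sb, opt) \
	(EXT4_SB(sb)->s_mount_opt2 |= EXT4_MOUNT2_##opt)

The new EXPLICIT_JOURNAL_CHECKSUM bit is consumed later in ext4_fill_super(), where an explicit journal_checksum on a journal-less filesystem now fails the mount instead of being silently ignored.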
@@ -1647,8 +1656,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
"quota options when quota turned on");
return -1;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ if (ext4_has_feature_quota(sb)) {
ext4_msg(sb, KERN_ERR,
"Cannot set journaled quota options "
"when QUOTA feature is enabled");
@@ -1656,8 +1664,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
-#ifndef CONFIG_FS_DAX
} else if (token == Opt_dax) {
+#ifdef CONFIG_FS_DAX
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= m->mount_opt;
+#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
#endif
@@ -1707,7 +1719,7 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
#ifdef CONFIG_QUOTA
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ if (ext4_has_feature_quota(sb) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
"feature is enabled");
@@ -1880,7 +1892,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
return _ext4_show_options(seq, root->d_sb, 0);
}
-static int options_seq_show(struct seq_file *seq, void *offset)
+int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
int rc;
@@ -1891,19 +1903,6 @@ static int options_seq_show(struct seq_file *seq, void *offset)
return rc;
}
-static int options_open_fs(struct inode *inode, struct file *file)
-{
- return single_open(file, options_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ext4_seq_options_fops = {
- .owner = THIS_MODULE,
- .open = options_open_fs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
@@ -1944,7 +1943,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_mtime = cpu_to_le32(get_seconds());
ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_set_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
done:
@@ -2027,12 +2026,13 @@ failed:
return 0;
}
-static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
int offset;
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
@@ -2052,8 +2052,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
}
/* old crc16 code */
- if (!(sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+ if (!ext4_has_feature_gdt_csum(sb))
return 0;
offset = offsetof(struct ext4_group_desc, bg_checksum);
@@ -2063,8 +2062,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
crc = crc16(crc, (__u8 *)gdp, offset);
offset += sizeof(gdp->bg_checksum); /* skip checksum */
/* for checksum of struct ext4_group_desc do the rest...*/
- if ((sbi->s_es->s_feature_incompat &
- cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+ if (ext4_has_feature_64bit(sb) &&
offset < le16_to_cpu(sbi->s_es->s_desc_size))
crc = crc16(crc, (__u8 *)gdp + offset,
le16_to_cpu(sbi->s_es->s_desc_size) -
@@ -2078,8 +2076,7 @@ int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
- block_group, gdp)))
+ (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
return 0;
return 1;
@@ -2090,7 +2087,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
{
if (!ext4_has_group_desc_csum(sb))
return;
- gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+ gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}
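
Threading the super_block through ext4_group_desc_csum() lets it use the same feature predicates as the rest of the code instead of poking at s_es directly. For orientation, the legacy GDT_CSUM path whose feature tests were just converted computes, in outline (a sketch of the surrounding function, with offset = offsetof(struct ext4_group_desc, bg_checksum)):

	/* crc16 over the fs UUID, then the little-endian group
	 * number, then the descriptor with its own checksum field
	 * skipped; on a 64-bit fs the descriptor tail past the
	 * checksum is folded in as well. */
	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
	crc = crc16(crc, (__u8 *)gdp, offset);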
/* Called at mount-time, super-block is locked */
@@ -2106,7 +2103,7 @@ static int ext4_check_descriptors(struct super_block *sb,
int flexbg_flag = 0;
ext4_group_t i, grp = sbi->s_groups_count;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
flexbg_flag = 1;
ext4_debug("Checking group descriptors");
@@ -2150,7 +2147,7 @@ static int ext4_check_descriptors(struct super_block *sb,
if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Checksum for group %u failed (%u!=%u)",
- i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+ i, le16_to_cpu(ext4_group_desc_csum(sb, i,
gdp)), le16_to_cpu(gdp->bg_checksum));
if (!(sb->s_flags & MS_RDONLY)) {
ext4_unlock_group(sb, i);
@@ -2413,8 +2410,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
- nr < first_meta_bg)
+ if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
return logical_sb_block + nr + 1;
bg = sbi->s_desc_per_block * nr;
if (ext4_bg_has_super(sb, bg))
@@ -2470,335 +2466,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return ret;
}
-/* sysfs supprt */
-
-struct ext4_attr {
- struct attribute attr;
- ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
- ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
- const char *, size_t);
- union {
- int offset;
- int deprecated_val;
- } u;
-};
-
-static int parse_strtoull(const char *buf,
- unsigned long long max, unsigned long long *value)
-{
- int ret;
-
- ret = kstrtoull(skip_spaces(buf), 0, value);
- if (!ret && *value > max)
- ret = -EINVAL;
- return ret;
-}
-
-static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (s64) EXT4_C2B(sbi,
- percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
-}
-
-static ssize_t session_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- sbi->s_sectors_written_start) >> 1);
-}
-
-static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1)));
-}
-
-static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned long t;
- int ret;
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret)
- return ret;
-
- if (t && (!is_power_of_2(t) || t > 0x40000000))
- return -EINVAL;
-
- sbi->s_inode_readahead_blks = t;
- return count;
-}
-
-static ssize_t sbi_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t sbi_ui_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
- unsigned long t;
- int ret;
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret)
- return ret;
- *ui = t;
- return count;
-}
-
-static ssize_t es_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
-
- unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
- a->u.offset);
-
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t reserved_clusters_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
-}
-
-static ssize_t reserved_clusters_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned long long val;
- int ret;
-
- if (parse_strtoull(buf, -1ULL, &val))
- return -EINVAL;
- ret = ext4_reserve_clusters(sbi, val);
-
- return ret ? ret : count;
-}
-
-static ssize_t trigger_test_error(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- int len = count;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (len && buf[len-1] == '\n')
- len--;
-
- if (len)
- ext4_error(sbi->s_sb, "%.*s", len, buf);
- return count;
-}
-
-static ssize_t sbi_deprecated_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
-}
-
-#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .u = { \
- .offset = offsetof(struct ext4_sb_info, _elname),\
- }, \
-}
-
-#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .u = { \
- .offset = offsetof(struct ext4_super_block, _elname), \
- }, \
-}
-
-#define EXT4_ATTR(name, mode, show, store) \
-static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
-
-#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
-#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
-#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
-
-#define EXT4_RO_ATTR_ES_UI(name, elname) \
- EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
-#define EXT4_RW_ATTR_SBI_UI(name, elname) \
- EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
-
-#define ATTR_LIST(name) &ext4_attr_##name.attr
-#define EXT4_DEPRECATED_ATTR(_name, _val) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = 0444 }, \
- .show = sbi_deprecated_show, \
- .u = { \
- .deprecated_val = _val, \
- }, \
-}
-
-EXT4_RO_ATTR(delayed_allocation_blocks);
-EXT4_RO_ATTR(session_write_kbytes);
-EXT4_RO_ATTR(lifetime_write_kbytes);
-EXT4_RW_ATTR(reserved_clusters);
-EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
- inode_readahead_blks_store, s_inode_readahead_blks);
-EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
-EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
-EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
-EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
-EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
-EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
-EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
-
-static struct attribute *ext4_attrs[] = {
- ATTR_LIST(delayed_allocation_blocks),
- ATTR_LIST(session_write_kbytes),
- ATTR_LIST(lifetime_write_kbytes),
- ATTR_LIST(reserved_clusters),
- ATTR_LIST(inode_readahead_blks),
- ATTR_LIST(inode_goal),
- ATTR_LIST(mb_stats),
- ATTR_LIST(mb_max_to_scan),
- ATTR_LIST(mb_min_to_scan),
- ATTR_LIST(mb_order2_req),
- ATTR_LIST(mb_stream_req),
- ATTR_LIST(mb_group_prealloc),
- ATTR_LIST(max_writeback_mb_bump),
- ATTR_LIST(extent_max_zeroout_kb),
- ATTR_LIST(trigger_fs_error),
- ATTR_LIST(err_ratelimit_interval_ms),
- ATTR_LIST(err_ratelimit_burst),
- ATTR_LIST(warning_ratelimit_interval_ms),
- ATTR_LIST(warning_ratelimit_burst),
- ATTR_LIST(msg_ratelimit_interval_ms),
- ATTR_LIST(msg_ratelimit_burst),
- ATTR_LIST(errors_count),
- ATTR_LIST(first_error_time),
- ATTR_LIST(last_error_time),
- NULL,
-};
-
-/* Features this copy of ext4 supports */
-EXT4_INFO_ATTR(lazy_itable_init);
-EXT4_INFO_ATTR(batched_discard);
-EXT4_INFO_ATTR(meta_bg_resize);
-EXT4_INFO_ATTR(encryption);
-
-static struct attribute *ext4_feat_attrs[] = {
- ATTR_LIST(lazy_itable_init),
- ATTR_LIST(batched_discard),
- ATTR_LIST(meta_bg_resize),
- ATTR_LIST(encryption),
- NULL,
-};
-
-static ssize_t ext4_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
- return a->show ? a->show(a, sbi, buf) : 0;
-}
-
-static ssize_t ext4_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
- return a->store ? a->store(a, sbi, buf, len) : 0;
-}
-
-static void ext4_sb_release(struct kobject *kobj)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- complete(&sbi->s_kobj_unregister);
-}
-
-static const struct sysfs_ops ext4_attr_ops = {
- .show = ext4_attr_show,
- .store = ext4_attr_store,
-};
-
-static struct kobj_type ext4_ktype = {
- .default_attrs = ext4_attrs,
- .sysfs_ops = &ext4_attr_ops,
- .release = ext4_sb_release,
-};
-
-static void ext4_feat_release(struct kobject *kobj)
-{
- complete(&ext4_feat->f_kobj_unregister);
-}
-
-static ssize_t ext4_feat_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "supported\n");
-}
-
-/*
- * We can not use ext4_attr_show/store because it relies on the kobject
- * being embedded in the ext4_sb_info structure which is definitely not
- * true in this case.
- */
-static const struct sysfs_ops ext4_feat_ops = {
- .show = ext4_feat_show,
- .store = NULL,
-};
-
-static struct kobj_type ext4_feat_ktype = {
- .default_attrs = ext4_feat_attrs,
- .sysfs_ops = &ext4_feat_ops,
- .release = ext4_feat_release,
-};
-
/*
* Check whether this filesystem can be mounted based on
* the features present and the RDONLY/RDWR mount requested.
@@ -2807,7 +2474,7 @@ static struct kobj_type ext4_feat_ktype = {
*/
static int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ if (ext4_has_unknown_ext4_incompat_features(sb)) {
ext4_msg(sb, KERN_ERR,
"Couldn't mount because of "
"unsupported optional features (%x)",
@@ -2819,14 +2486,14 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
if (readonly)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+ if (ext4_has_feature_readonly(sb)) {
ext4_msg(sb, KERN_INFO, "filesystem is read-only");
sb->s_flags |= MS_RDONLY;
return 1;
}
/* Check that feature set is OK for a read-write mount */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
"unsupported optional features (%x)",
(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
@@ -2837,7 +2504,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
* Large file size enabled file system can only be mounted
* read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
*/
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
if (sizeof(blkcnt_t) < sizeof(u64)) {
ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
"cannot be mounted RDWR without "
@@ -2845,8 +2512,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
ext4_msg(sb, KERN_ERR,
"Can't support bigalloc feature without "
"extents feature\n");
@@ -2854,8 +2520,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
}
#ifndef CONFIG_QUOTA
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !readonly) {
+ if (ext4_has_feature_quota(sb) && !readonly) {
ext4_msg(sb, KERN_ERR,
"Filesystem with quota feature cannot be mounted RDWR "
"without CONFIG_QUOTA");
@@ -3312,7 +2977,7 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ if (!ext4_has_feature_bigalloc(sb))
return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
sbi->s_itb_per_group + 2);
@@ -3403,10 +3068,10 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
-
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
* There's no need to reserve anything when we aren't using extents.
@@ -3414,8 +3079,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
* hole punching doesn't need new metadata... This is needed especially
* to keep ext2/3 backward compatibility.
*/
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
- return 0;
+ if (!ext4_has_feature_extents(sb))
+ return;
/*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
@@ -3424,26 +3089,13 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
- resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
- EXT4_SB(sb)->s_cluster_bits;
+ resv_clusters = (ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits);
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
- return resv_clusters;
-}
-
-
-static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
-{
- ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
- sbi->s_cluster_bits;
-
- if (count >= clusters)
- return -EINVAL;
-
- atomic64_set(&sbi->s_resv_clusters, count);
- return 0;
+ atomic64_set(&sbi->s_resv_clusters, resv_clusters);
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
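
To make the "2% or 4096 clusters, whichever is smaller" rule concrete (worked example; assumes 4 KiB blocks and no bigalloc, so one cluster per block):

	/* 1 TiB filesystem: 2^40 / 2^12 = 268435456 blocks/clusters.
	 * 2% of that: 268435456 / 50 = 5368709 clusters.
	 * Cap: min(5368709, 4096) = 4096 clusters = 16 MiB held
	 * back, so extent-tree splits during hole punching or
	 * truncation cannot fail for lack of metadata space. */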
@@ -3526,9 +3178,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
/* Warn if metadata_csum and gdt_csum are both set. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
ext4_warning(sb, "metadata_csum and uninit_bg are "
"redundant flags; please run fsck.");
@@ -3541,8 +3192,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Load the checksum driver */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -3557,11 +3207,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
"invalid superblock checksum. Run e2fsck?");
silent = 1;
+ ret = -EFSBADCRC;
goto cantfind_ext4;
}
/* Precompute checksum seed for all metadata */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
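
This is the consumer of the new csum_seed feature advertised in sysfs below: when set, the crc32c seed is read straight from the superblock instead of being derived from the UUID, so the UUID can later be changed (e.g. by tune2fs) without rewriting every checksummed metadata block. Schematically:

	if (ext4_has_feature_csum_seed(sb))
		seed = le32_to_cpu(es->s_checksum_seed);	/* survives UUID changes */
	else
		seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid));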
@@ -3664,17 +3317,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
- (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
- EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
- EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+ (ext4_has_compat_features(sb) ||
+ ext4_has_ro_compat_features(sb) ||
+ ext4_has_incompat_features(sb)))
ext4_msg(sb, KERN_WARNING,
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
set_opt2(sb, HURD_COMPAT);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
goto failed_mount;
@@ -3732,8 +3384,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
- es->s_encryption_level) {
+ if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
es->s_encryption_level);
goto failed_mount;
@@ -3765,8 +3416,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -3790,7 +3440,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (ext4_has_feature_64bit(sb)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
!is_power_of_2(sbi->s_desc_size)) {
@@ -3821,7 +3471,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ if (ext4_has_feature_dir_index(sb)) {
i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH)
sbi->s_hash_unsigned = 3;
@@ -3841,8 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Handle clustersize */
clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ has_bigalloc = ext4_has_feature_bigalloc(sb);
if (has_bigalloc) {
if (clustersize < blocksize) {
ext4_msg(sb, KERN_ERR,
@@ -3961,13 +3610,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_proc_root)
- sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
- if (sbi->s_proc)
- proc_create_data("options", S_IRUGO, sbi->s_proc,
- &ext4_seq_options_fops, sb);
-
bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
@@ -3982,6 +3624,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
goto failed_mount2;
}
@@ -4007,7 +3650,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ if (ext4_has_feature_quota(sb))
sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
@@ -4021,11 +3664,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
- EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_RECOVER));
+ ext4_has_feature_journal_needs_recovery(sb));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
- !(sb->s_flags & MS_RDONLY))
+ if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY))
if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
goto failed_mount3a;
@@ -4033,23 +3674,47 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
*/
- if (!test_opt(sb, NOLOAD) &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
- EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
goto failed_mount_wq;
} else {
+ /* Nojournal mode, all journal mount options are illegal */
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_checksum, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "commit=%lu, fs mounted w/o journal",
+ sbi->s_commit_interval / HZ);
+ goto failed_mount_wq;
+ }
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "data=, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
+ clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
+ if (ext4_has_feature_64bit(sb) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
@@ -4101,18 +3766,16 @@ no_journal:
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
- EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+ if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
(blocksize != PAGE_CACHE_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Unsupported blocksize for fs encryption");
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
- !(sb->s_flags & MS_RDONLY) &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) &&
+ !ext4_has_feature_encrypt(sb)) {
+ ext4_set_feature_encrypt(sb);
ext4_commit_super(sb, 1);
}
@@ -4171,8 +3834,7 @@ no_journal:
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+ if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
sbi->s_want_extra_isize =
@@ -4192,12 +3854,7 @@ no_journal:
"available");
}
- err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
- "reserved pool", ext4_calculate_resv_clusters(sb));
- goto failed_mount4a;
- }
+ ext4_set_resv_clusters(sb);
err = ext4_setup_system_zone(sb);
if (err) {
@@ -4236,7 +3893,7 @@ no_journal:
goto failed_mount6;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
"unable to initialize "
@@ -4248,17 +3905,13 @@ no_journal:
if (err)
goto failed_mount6;
- sbi->s_kobj.kset = ext4_kset;
- init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
- "%s", sb->s_id);
+ err = ext4_register_sysfs(sb);
if (err)
goto failed_mount7;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !(sb->s_flags & MS_RDONLY)) {
+ if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) {
err = ext4_enable_quotas(sb);
if (err)
goto failed_mount8;
@@ -4313,7 +3966,7 @@ cantfind_ext4:
#ifdef CONFIG_QUOTA
failed_mount8:
- kobject_del(&sbi->s_kobj);
+ ext4_unregister_sysfs(sb);
#endif
failed_mount7:
ext4_unregister_li_request(sb);
@@ -4353,10 +4006,6 @@ failed_mount2:
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- if (sbi->s_proc) {
- remove_proc_entry("options", sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -4403,7 +4052,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
/* First, test for the existence of a valid inode on disk. Bad
* things happen if we iget() an unused inode, as the subsequent
@@ -4453,7 +4102,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
@@ -4545,7 +4194,7 @@ static int ext4_load_journal(struct super_block *sb,
int err = 0;
int really_read_only;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -4562,7 +4211,7 @@ static int ext4_load_journal(struct super_block *sb,
* crash? For recovery, we need to check in advance whether we
* can get read-write access to the device.
*/
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ if (ext4_has_feature_journal_needs_recovery(sb)) {
if (sb->s_flags & MS_RDONLY) {
ext4_msg(sb, KERN_INFO, "INFO: recovery "
"required on readonly filesystem");
@@ -4593,7 +4242,7 @@ static int ext4_load_journal(struct super_block *sb,
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+ if (!ext4_has_feature_journal_needs_recovery(sb))
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
@@ -4707,7 +4356,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
{
journal_t *journal = EXT4_SB(sb)->s_journal;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (!ext4_has_feature_journal(sb)) {
BUG_ON(journal != NULL);
return;
}
@@ -4715,9 +4364,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
if (jbd2_journal_flush(journal) < 0)
goto out;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+ if (ext4_has_feature_journal_needs_recovery(sb) &&
sb->s_flags & MS_RDONLY) {
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
}
@@ -4737,7 +4386,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
int j_errno;
const char *errstr;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
journal = EXT4_SB(sb)->s_journal;
@@ -4852,7 +4501,7 @@ static int ext4_freeze(struct super_block *sb)
goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
}
error = ext4_commit_super(sb, 1);
@@ -4874,7 +4523,7 @@ static int ext4_unfreeze(struct super_block *sb)
if (EXT4_SB(sb)->s_journal) {
/* Reset the needs_recovery flag before the fs is unlocked. */
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_set_feature_journal_needs_recovery(sb);
}
ext4_commit_super(sb, 1);
@@ -5027,8 +4676,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
} else {
/* Make sure we can mount this feature set readwrite */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_READONLY) ||
+ if (ext4_has_feature_readonly(sb) ||
!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
@@ -5044,9 +4692,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
ext4_msg(sb, KERN_ERR,
"ext4_remount: Checksum for group %u failed (%u!=%u)",
- g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+ g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
le16_to_cpu(gdp->bg_checksum));
- err = -EINVAL;
+ err = -EFSBADCRC;
goto restore_opts;
}
}
@@ -5076,8 +4724,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sbi->s_mount_state = le16_to_cpu(es->s_state);
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_has_feature_mmp(sb))
if (ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block))) {
err = -EROFS;
@@ -5110,8 +4757,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
- else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ else if (ext4_has_feature_quota(sb)) {
err = ext4_enable_quotas(sb);
if (err)
goto restore_opts;
@@ -5255,7 +4901,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot)
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* Are we journaling quotas? */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ if (ext4_has_feature_quota(sb) ||
sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
@@ -5343,7 +4989,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
};
- BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+ BUG_ON(!ext4_has_feature_quota(sb));
if (!qf_inums[type])
return -EPERM;
@@ -5537,11 +5183,11 @@ static inline void unregister_as_ext2(void)
static inline int ext2_feature_set_ok(struct super_block *sb)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ if (ext4_has_unknown_ext2_incompat_features(sb))
return 0;
if (sb->s_flags & MS_RDONLY)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ if (ext4_has_unknown_ext2_ro_compat_features(sb))
return 0;
return 1;
}
@@ -5566,13 +5212,13 @@ static inline void unregister_as_ext3(void)
static inline int ext3_feature_set_ok(struct super_block *sb)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ if (ext4_has_unknown_ext3_incompat_features(sb))
return 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ if (!ext4_has_feature_journal(sb))
return 0;
if (sb->s_flags & MS_RDONLY)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ if (ext4_has_unknown_ext3_ro_compat_features(sb))
return 0;
return 1;
}
@@ -5586,37 +5232,6 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
-static int __init ext4_init_feat_adverts(void)
-{
- struct ext4_features *ef;
- int ret = -ENOMEM;
-
- ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
- if (!ef)
- goto out;
-
- ef->f_kobj.kset = ext4_kset;
- init_completion(&ef->f_kobj_unregister);
- ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
- "features");
- if (ret) {
- kfree(ef);
- goto out;
- }
-
- ext4_feat = ef;
- ret = 0;
-out:
- return ret;
-}
-
-static void ext4_exit_feat_adverts(void)
-{
- kobject_put(&ext4_feat->f_kobj);
- wait_for_completion(&ext4_feat->f_kobj_unregister);
- kfree(ext4_feat);
-}
-
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
@@ -5643,21 +5258,15 @@ static int __init ext4_init_fs(void)
err = ext4_init_pageio();
if (err)
- goto out7;
+ goto out5;
err = ext4_init_system_zone();
if (err)
- goto out6;
- ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
- if (!ext4_kset) {
- err = -ENOMEM;
- goto out5;
- }
- ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+ goto out4;
- err = ext4_init_feat_adverts();
+ err = ext4_init_sysfs();
if (err)
- goto out4;
+ goto out3;
err = ext4_init_mballoc();
if (err)
@@ -5682,16 +5291,12 @@ out1:
ext4_mballoc_ready = 0;
ext4_exit_mballoc();
out2:
- ext4_exit_feat_adverts();
-out4:
- if (ext4_proc_root)
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
-out5:
+ ext4_exit_sysfs();
+out3:
ext4_exit_system_zone();
-out6:
+out4:
ext4_exit_pageio();
-out7:
+out5:
ext4_exit_es();
return err;
@@ -5706,9 +5311,7 @@ static void __exit ext4_exit_fs(void)
unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
ext4_exit_mballoc();
- ext4_exit_feat_adverts();
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
+ ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
ext4_exit_es();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index c677f2c1044b..6f7ee30a89ce 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,17 +23,21 @@
#include "xattr.h"
#ifdef CONFIG_EXT4_FS_ENCRYPTION
-static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *ext4_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
struct ext4_str cstr, pstr;
- struct inode *inode = d_inode(dentry);
struct ext4_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
int res;
u32 plen, max_size = inode->i_sb->s_blocksize;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = ext4_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -45,19 +49,19 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
}
/* Symlink is encrypted */
sd = (struct ext4_encrypted_symlink_data *)caddr;
cstr.name = sd->encrypted_path;
- cstr.len = le32_to_cpu(sd->len);
+ cstr.len = le16_to_cpu(sd->len);
if ((cstr.len +
sizeof(struct ext4_encrypted_symlink_data) - 1) >
max_size) {
/* Symlink data on the disk is corrupted */
- res = -EIO;
+ res = -EFSCORRUPTED;
goto errout;
}
plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
@@ -75,24 +79,20 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
/* Null-terminate the name */
if (res <= plen)
paddr[res] = '\0';
- if (cpage) {
- kunmap(cpage);
+ if (cpage)
page_cache_release(cpage);
- }
- return *cookie = paddr;
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
- if (cpage) {
- kunmap(cpage);
+ if (cpage)
page_cache_release(cpage);
- }
kfree(paddr);
return ERR_PTR(res);
}
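
ext4 is converted here to the new ->get_link() interface, which replaces the ->follow_link()/->put_link() pair: the method returns the target string and registers its cleanup through a struct delayed_call, and a NULL dentry signals an RCU-mode lookup that this helper cannot service. A minimal sketch for a hypothetical filesystem (foo_get_link and the "target" string are illustrative, not from the patch):

static const char *foo_get_link(struct dentry *dentry, struct inode *inode,
				struct delayed_call *done)
{
	char *link;

	if (!dentry)			/* RCU walk: bail out, as ext4 does */
		return ERR_PTR(-ECHILD);

	link = kstrdup("target", GFP_KERNEL);	/* build the path string */
	if (!link)
		return ERR_PTR(-ENOMEM);

	set_delayed_call(done, kfree_link, link); /* VFS frees it later */
	return link;
}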
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext4_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -103,8 +103,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -114,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
new file mode 100644
index 000000000000..1420a3c614af
--- /dev/null
+++ b/fs/ext4/sysfs.c
@@ -0,0 +1,448 @@
+/*
+ * linux/fs/ext4/sysfs.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+typedef enum {
+ attr_noop,
+ attr_delayed_allocation_blocks,
+ attr_session_write_kbytes,
+ attr_lifetime_write_kbytes,
+ attr_reserved_clusters,
+ attr_inode_readahead,
+ attr_trigger_test_error,
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_atomic,
+} attr_id_t;
+
+typedef enum {
+ ptr_explicit,
+ ptr_ext4_sb_info_offset,
+ ptr_ext4_super_block_offset,
+} attr_ptr_t;
+
+static const char *proc_dirname = "fs/ext4";
+static struct proc_dir_entry *ext4_proc_root;
+
+struct ext4_attr {
+ struct attribute attr;
+ short attr_id;
+ short attr_ptr;
+ union {
+ int offset;
+ void *explicit_ptr;
+ } u;
+};
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1)));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
+ return -EINVAL;
+
+ sbi->s_inode_readahead_blks = t;
+ return count;
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits);
+ int ret;
+
+ ret = kstrtoull(skip_spaces(buf), 0, &val);
+ if (ret || val >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, val);
+ return count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ int len = count;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+
+ if (len)
+ ext4_error(sbi->s_sb, "%.*s", len, buf);
+ return count;
+}
+
+#define EXT4_ATTR(_name,_mode,_id) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+
+#define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name)
+
+#define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature)
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .attr_ptr = ptr_##_struct##_offset, \
+ .u = { \
+ .offset = offsetof(struct _struct, _elname),\
+ }, \
+}
+
+#define EXT4_RO_ATTR_ES_UI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
+
+#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
+
+#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .attr_ptr = ptr_explicit, \
+ .u = { \
+ .explicit_ptr = _ptr, \
+ }, \
+}
+
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
+EXT4_ATTR_FUNC(session_write_kbytes, 0444);
+EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+EXT4_ATTR_FUNC(reserved_clusters, 0644);
+
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
+ ext4_sb_info, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+
+static unsigned int old_bump_val = 128;
+EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
+
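
To see what the table-driven scheme buys, note that EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal) above expands (via EXT4_ATTR_OFFSET) to roughly:

static struct ext4_attr ext4_attr_inode_goal = {
	.attr		= { .name = "inode_goal", .mode = 0644 },
	.attr_id	= attr_pointer_ui,
	.attr_ptr	= ptr_ext4_sb_info_offset,
	.u		= { .offset = offsetof(struct ext4_sb_info, s_inode_goal) },
};

A single attr_pointer_ui case in ext4_attr_show()/ext4_attr_store() below then services every unsigned-int tunable through calc_ptr(), replacing the per-attribute show/store callbacks deleted from super.c.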
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(errors_count),
+ ATTR_LIST(first_error_time),
+ ATTR_LIST(last_error_time),
+ NULL,
+};
+
+/* Features this copy of ext4 supports */
+EXT4_ATTR_FEATURE(lazy_itable_init);
+EXT4_ATTR_FEATURE(batched_discard);
+EXT4_ATTR_FEATURE(meta_bg_resize);
+EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(metadata_csum_seed);
+
+static struct attribute *ext4_feat_attrs[] = {
+ ATTR_LIST(lazy_itable_init),
+ ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
+ ATTR_LIST(encryption),
+ ATTR_LIST(metadata_csum_seed),
+ NULL,
+};
+
+static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
+{
+ switch (a->attr_ptr) {
+ case ptr_explicit:
+ return a->u.explicit_ptr;
+ case ptr_ext4_sb_info_offset:
+ return (void *) (((char *) sbi) + a->u.offset);
+ case ptr_ext4_super_block_offset:
+ return (void *) (((char *) sbi->s_es) + a->u.offset);
+ }
+ return NULL;
+}
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+ void *ptr = calc_ptr(a, sbi);
+
+ switch (a->attr_id) {
+ case attr_delayed_allocation_blocks:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ case attr_session_write_kbytes:
+ return session_write_kbytes_show(a, sbi, buf);
+ case attr_lifetime_write_kbytes:
+ return lifetime_write_kbytes_show(a, sbi, buf);
+ case attr_reserved_clusters:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)
+ atomic64_read(&sbi->s_resv_clusters));
+ case attr_inode_readahead:
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned int *) ptr));
+ case attr_pointer_atomic:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ atomic_read((atomic_t *) ptr));
+ case attr_feature:
+ return snprintf(buf, PAGE_SIZE, "supported\n");
+ }
+
+ return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+ void *ptr = calc_ptr(a, sbi);
+ unsigned long t;
+ int ret;
+
+ switch (a->attr_id) {
+ case attr_reserved_clusters:
+ return reserved_clusters_store(a, sbi, buf, len);
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *((unsigned int *) ptr) = t;
+ return len;
+ case attr_inode_readahead:
+ return inode_readahead_blks_store(a, sbi, buf, len);
+ case attr_trigger_test_error:
+ return trigger_test_error(a, sbi, buf, len);
+ }
+ return 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+static struct kobj_type ext4_sb_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
+static struct kobj_type ext4_ktype = {
+ .sysfs_ops = &ext4_attr_ops,
+};
+
+static struct kset ext4_kset = {
+ .kobj = {.ktype = &ext4_ktype},
+};
+
+static struct kobj_type ext4_feat_ktype = {
+ .default_attrs = ext4_feat_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+};
+
+static struct kobject ext4_feat = {
+ .kset = &ext4_kset,
+};
+
+#define PROC_FILE_SHOW_DEFN(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
+} \
+\
+static const struct file_operations ext4_seq_##name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
+#define PROC_FILE_LIST(name) \
+ { __stringify(name), &ext4_seq_##name##_fops }
+
+PROC_FILE_SHOW_DEFN(es_shrinker_info);
+PROC_FILE_SHOW_DEFN(options);
+
+static struct ext4_proc_files {
+ const char *name;
+ const struct file_operations *fops;
+} proc_files[] = {
+ PROC_FILE_LIST(options),
+ PROC_FILE_LIST(es_shrinker_info),
+ PROC_FILE_LIST(mb_groups),
+ { NULL, NULL },
+};
+
+int ext4_register_sysfs(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_proc_files *p;
+ int err;
+
+ sbi->s_kobj.kset = &ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ return err;
+
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+ if (sbi->s_proc) {
+ for (p = proc_files; p->name; p++)
+ proc_create_data(p->name, S_IRUGO, sbi->s_proc,
+ p->fops, sb);
+ }
+ return 0;
+}
+
+void ext4_unregister_sysfs(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_proc_files *p;
+
+ if (sbi->s_proc) {
+ for (p = proc_files; p->name; p++)
+ remove_proc_entry(p->name, sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+}
+
+int __init ext4_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&ext4_kset.kobj, "ext4");
+ ext4_kset.kobj.parent = fs_kobj;
+ ret = kset_register(&ext4_kset);
+ if (ret)
+ return ret;
+
+ ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype,
+ NULL, "features");
+ if (ret)
+ kset_unregister(&ext4_kset);
+ else
+ ext4_proc_root = proc_mkdir(proc_dirname, NULL);
+ return ret;
+}
+
+void ext4_exit_sysfs(void)
+{
+ kobject_put(&ext4_feat);
+ kset_unregister(&ext4_kset);
+ remove_proc_entry(proc_dirname, NULL);
+ ext4_proc_root = NULL;
+}
+
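
Annotation: the new sysfs.c above replaces per-attribute show/store functions with a single table-driven dispatcher. Each attribute records how its value is reached (an explicit pointer, or an offset into ext4_sb_info or the on-disk superblock), and calc_ptr() resolves that at access time. Below is a minimal userspace sketch of the same offset-dispatch pattern; every name in it is invented for illustration and is not part of the patch.

#include <stdio.h>
#include <stddef.h>

struct sb_info {
	unsigned int err_burst;
	unsigned int msg_interval;
};

enum ptr_kind { PTR_EXPLICIT, PTR_SB_OFFSET };

struct attr {
	const char *name;
	enum ptr_kind kind;
	union {
		void *explicit_ptr;
		size_t offset;
	} u;
};

/* One table entry per tunable: the stored offset stands in for a
 * dedicated show/store pair, mirroring calc_ptr() above. */
#define SB_ATTR(field) \
	{ #field, PTR_SB_OFFSET, { .offset = offsetof(struct sb_info, field) } }

static const struct attr attrs[] = {
	SB_ATTR(err_burst),
	SB_ATTR(msg_interval),
};

/* Counterpart of calc_ptr(): resolve an attribute to a live pointer. */
static void *calc_field_ptr(const struct attr *a, struct sb_info *sbi)
{
	switch (a->kind) {
	case PTR_EXPLICIT:
		return a->u.explicit_ptr;
	case PTR_SB_OFFSET:
		return (char *)sbi + a->u.offset;
	}
	return NULL;
}

int main(void)
{
	struct sb_info sbi = { .err_burst = 10, .msg_interval = 100 };
	size_t i;

	for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
		printf("%s = %u\n", attrs[i].name,
		       *(unsigned int *)calc_field_ptr(&attrs[i], &sbi));
	return 0;
}

The payoff is the same as in the patch: adding a tunable means adding one table row, not two functions.
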
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 16e28c08d1e8..a95151e875bd 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -68,10 +68,8 @@
printk("\n"); \
} while (0)
# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
+ printk(KERN_DEBUG "block %pg:%lu: ", \
+ bh->b_bdev, (unsigned long) bh->b_blocknr); \
printk(f); \
printk("\n"); \
} while (0)
@@ -195,7 +193,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
- return -EIO;
+ return -EFSCORRUPTED;
e = next;
}
@@ -205,7 +203,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
(void *)e + sizeof(__u32) ||
value_start + le16_to_cpu(entry->e_value_offs) +
le32_to_cpu(entry->e_value_size) > end))
- return -EIO;
+ return -EFSCORRUPTED;
entry = EXT4_XATTR_NEXT(entry);
}
@@ -222,9 +220,9 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
- return -EIO;
+ return -EFSCORRUPTED;
if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
- return -EIO;
+ return -EFSBADCRC;
error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
bh->b_data);
if (!error)
@@ -239,7 +237,7 @@ ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
if (entry->e_value_block != 0 || value_size > size ||
le16_to_cpu(entry->e_value_offs) + value_size > size)
- return -EIO;
+ return -EFSCORRUPTED;
return 0;
}
@@ -266,7 +264,7 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
}
*pentry = entry;
if (!cmp && ext4_xattr_check_entry(entry, size))
- return -EIO;
+ return -EFSCORRUPTED;
return cmp ? -ENODATA : 0;
}
@@ -297,13 +295,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
@@ -404,20 +402,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
const struct xattr_handler *handler =
ext4_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest)
return -ERANGE;
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
}
- return buffer_size - rest;
+ return buffer_size - rest; /* total size */
}
static int
@@ -445,7 +447,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
@@ -525,12 +527,12 @@ errout:
static void ext4_xattr_update_super_block(handle_t *handle,
struct super_block *sb)
{
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+ if (ext4_has_feature_xattr(sb))
return;
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
- EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
+ ext4_set_feature_xattr(sb);
ext4_handle_dirty_super(handle, sb);
}
}
@@ -751,7 +753,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
if (ext4_xattr_check_block(inode, bs->bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
/* Find the named attribute. */
@@ -811,7 +813,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
bs->bh);
}
unlock_buffer(bs->bh);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (!error)
error = ext4_handle_dirty_xattr_block(handle,
@@ -855,7 +857,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
}
error = ext4_xattr_set_entry(i, s);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
@@ -1314,7 +1316,7 @@ retry:
if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
base = BHDR(bh);
@@ -1579,7 +1581,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
return 1;
if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
- return -EIO;
+ return -EFSCORRUPTED;
if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
(char *)header2 + le16_to_cpu(entry2->e_value_offs),
le32_to_cpu(entry1->e_value_size)))
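
Annotation: the ext4_xattr_list_entries() rewrite above moves name assembly into the filesystem. The handler now contributes only a prefix and an optional visibility predicate, while the loop itself packs prefix + name + NUL into the caller's buffer and returns -ERANGE on overflow. A hedged userspace sketch of that packing step follows; the function and names are made up, only the layout matches listxattr(2) semantics.

#include <errno.h>
#include <stdio.h>
#include <string.h>

static ssize_t pack_entry(char *buf, size_t rest,
			  const char *prefix, const char *name)
{
	size_t plen = strlen(prefix), nlen = strlen(name);
	size_t size = plen + nlen + 1;	/* +1 for the trailing NUL */

	if (buf) {
		if (size > rest)
			return -ERANGE;	/* caller's buffer too small */
		memcpy(buf, prefix, plen);
		memcpy(buf + plen, name, nlen);
		buf[plen + nlen] = '\0';
	}
	return size;	/* bytes consumed (or required, if buf is NULL) */
}

int main(void)
{
	char buf[64];
	ssize_t n = pack_entry(buf, sizeof(buf), "user.", "comment");

	if (n > 0)
		printf("packed %zd bytes: %s\n", n, buf);
	return 0;
}

As in the kernel loop, passing a NULL buffer turns the routine into a pure size query, which is how listxattr callers probe for the needed buffer length.
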
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 95d90e0560f0..3e81bdca071a 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -11,38 +11,20 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
- const size_t total_len = prefix_len + name_len + 1;
-
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
-ext4_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
static int
-ext4_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -76,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
.set = ext4_xattr_security_set,
};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 891ee2ddfbd6..2a3c6f9b8cb8 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -12,40 +12,26 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext4_xattr_trusted_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
-ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+ext4_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
static int
-ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 6ed932b3c043..d152f431e432 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -11,30 +11,17 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext4_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
-ext4_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
@@ -42,11 +29,10 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext4_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
name, value, size, flags);
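
Annotation: taken together, the three handler diffs above show one API migration. The old list() callback that copied "prefix.name\0" itself becomes a plain bool predicate (or disappears entirely when the attribute is always visible), the empty-name -EINVAL check moves into common VFS code, and get/set gain the handler as their first argument. A mock of the resulting handler shape, using hypothetical types only (the real struct xattr_handler lives in include/linux/xattr.h):

#include <stdbool.h>
#include <stdio.h>

struct dentry;	/* opaque here */

struct xattr_handler_mock {
	const char *prefix;
	/* NULL means "always visible"; otherwise a yes/no predicate. */
	bool (*list)(struct dentry *dentry);
};

static bool trusted_visible(struct dentry *dentry)
{
	(void)dentry;
	return false;	/* stand-in for capable(CAP_SYS_ADMIN) */
}

static const struct xattr_handler_mock handlers[] = {
	{ .prefix = "security.", .list = NULL },
	{ .prefix = "trusted.",  .list = trusted_visible },
};

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
		const struct xattr_handler_mock *h = &handlers[i];
		bool shown = !h->list || h->list(NULL);

		printf("%-10s %s\n", h->prefix, shown ? "listed" : "hidden");
	}
	return 0;
}
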
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index c5a38e352a80..3842af954cd5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -47,7 +47,8 @@ repeat:
/*
* We guarantee no failure on the returned page.
*/
-struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
+ bool is_meta)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
@@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
.blk_addr = index,
.encrypted_page = NULL,
};
+
+ if (unlikely(!is_meta))
+ fio.rw &= ~REQ_META;
repeat:
page = grab_cache_page(mapping, index);
if (!page) {
@@ -91,6 +95,17 @@ out:
return page;
}
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, true);
+}
+
+/* for POR only */
+struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, false);
+}
+
bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
{
switch (type) {
@@ -125,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+ int type, bool sync)
{
block_t prev_blk_addr = 0;
struct page *page;
@@ -133,10 +149,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO,
+ .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
.encrypted_page = NULL,
};
+ if (unlikely(type == META_POR))
+ fio.rw &= ~REQ_META;
+
for (; nrpages-- > 0; blkno++) {
if (!is_valid_blkaddr(sbi, blkno, type))
@@ -196,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
}
static int f2fs_write_meta_page(struct page *page,
@@ -218,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page,
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
- if (wbc->for_reclaim)
+ if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, META, WRITE);
return 0;
@@ -257,7 +276,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
struct pagevec pvec;
long nwritten = 0;
struct writeback_control wbc = {
@@ -277,6 +296,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (prev == LONG_MAX)
+ prev = page->index - 1;
+ if (nr_to_write != LONG_MAX && page->index != prev + 1) {
+ pagevec_release(&pvec);
+ goto stop;
+ }
+
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -297,13 +323,14 @@ continue_unlock:
break;
}
nwritten++;
+ prev = page->index;
if (unlikely(nwritten >= nr_to_write))
break;
}
pagevec_release(&pvec);
cond_resched();
}
-
+stop:
if (nwritten)
f2fs_submit_merged_bio(sbi, type, WRITE);
@@ -383,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
spin_unlock(&im->ino_lock);
}
-void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* add new dirty ino entry into list */
__add_ino_entry(sbi, ino, type);
}
-void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* remove dirty ino entry from list */
__remove_ino_entry(sbi, ino, type);
@@ -407,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi)
{
struct ino_entry *e, *tmp;
int i;
@@ -495,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
- ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
+ ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
for (i = 0; i < orphan_blocks; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
@@ -695,47 +722,48 @@ fail_no_cp:
return -EINVAL;
}
-static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
+static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
- return -EEXIST;
+ if (is_inode_flag_set(fi, flag))
+ return;
- set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- F2FS_I(inode)->dirty_dir = new;
- list_add_tail(&new->list, &sbi->dir_inode_list);
- stat_inc_dirty_dir(sbi);
- return 0;
+ set_inode_flag(fi, flag);
+ list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
+ stat_inc_dirty_inode(sbi, type);
+}
+
+static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
+
+ if (get_dirty_pages(inode) ||
+ !is_inode_flag_set(F2FS_I(inode), flag))
+ return;
+
+ list_del_init(&fi->dirty_list);
+ clear_inode_flag(fi, flag);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
void update_dirty_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *new;
- int ret = 0;
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
!S_ISLNK(inode->i_mode))
return;
- if (!S_ISDIR(inode->i_mode)) {
- inode_inc_dirty_pages(inode);
- goto out;
- }
-
- new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
-
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
+ spin_lock(&sbi->inode_lock[type]);
+ __add_dirty_inode(inode, type);
inode_inc_dirty_pages(inode);
- spin_unlock(&sbi->dir_inode_lock);
+ spin_unlock(&sbi->inode_lock[type]);
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
-out:
SetPagePrivate(page);
f2fs_trace_pid(page);
}
@@ -743,70 +771,60 @@ out:
void add_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *new =
- f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- int ret = 0;
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
-
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
- spin_unlock(&sbi->dir_inode_lock);
-
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
+ spin_lock(&sbi->inode_lock[DIR_INODE]);
+ __add_dirty_inode(inode, DIR_INODE);
+ spin_unlock(&sbi->inode_lock[DIR_INODE]);
}
-void remove_dirty_dir_inode(struct inode *inode)
+void remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *entry;
-
- if (!S_ISDIR(inode->i_mode))
- return;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
- spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_pages(inode) ||
- !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
- spin_unlock(&sbi->dir_inode_lock);
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
return;
- }
- entry = F2FS_I(inode)->dirty_dir;
- list_del(&entry->list);
- F2FS_I(inode)->dirty_dir = NULL;
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- stat_dec_dirty_dir(sbi);
- spin_unlock(&sbi->dir_inode_lock);
- kmem_cache_free(inode_entry_slab, entry);
+ spin_lock(&sbi->inode_lock[type]);
+ __remove_dirty_inode(inode, type);
+ spin_unlock(&sbi->inode_lock[type]);
/* Only from the recovery routine */
- if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
- clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
+ clear_inode_flag(fi, FI_DELAY_IPUT);
iput(inode);
}
}
-void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
{
struct list_head *head;
- struct inode_entry *entry;
struct inode *inode;
+ struct f2fs_inode_info *fi;
+ bool is_dir = (type == DIR_INODE);
+
+ trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
- spin_lock(&sbi->dir_inode_lock);
+ spin_lock(&sbi->inode_lock[type]);
- head = &sbi->dir_inode_list;
+ head = &sbi->inode_list[type];
if (list_empty(head)) {
- spin_unlock(&sbi->dir_inode_lock);
- return;
+ spin_unlock(&sbi->inode_lock[type]);
+ trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
+ return 0;
}
- entry = list_entry(head->next, struct inode_entry, list);
- inode = igrab(entry->inode);
- spin_unlock(&sbi->dir_inode_lock);
+ fi = list_entry(head->next, struct f2fs_inode_info, dirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[type]);
if (inode) {
filemap_fdatawrite(inode->i_mapping);
iput(inode);
@@ -841,11 +859,9 @@ retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- sync_dirty_dir_inodes(sbi);
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
+ err = sync_dirty_inodes(sbi, DIR_INODE);
+ if (err)
goto out;
- }
goto retry_flush_dents;
}
@@ -858,10 +874,9 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- sync_node_pages(sbi, 0, &wbc);
- if (unlikely(f2fs_cp_error(sbi))) {
+ err = sync_node_pages(sbi, 0, &wbc);
+ if (err) {
f2fs_unlock_all(sbi);
- err = -EIO;
goto out;
}
goto retry_flush_nodes;
@@ -892,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -918,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
}
next_free_nid(sbi, &last_nid);
@@ -1000,6 +1015,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_addr(sbi);
+ /* need to wait for end_io results */
+ wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
/* write out checkpoint buffer at block 0 */
update_meta_page(sbi, ckpt, start_blk++);
@@ -1026,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
wait_on_all_pages_writeback(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
@@ -1049,22 +1069,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
discard_blk);
- release_dirty_inode(sbi);
+ release_ino_entry(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
+
+ return 0;
}
/*
* We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
+ int err = 0;
mutex_lock(&sbi->cp_mutex);
@@ -1072,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
goto out;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
goto out;
- if (f2fs_readonly(sbi->sb))
+ }
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
goto out;
+ }
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
- if (block_operations(sbi))
+ err = block_operations(sbi);
+ if (err)
goto out;
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
@@ -1101,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, cpc);
+ err = do_checkpoint(sbi, cpc);
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
@@ -1109,9 +1137,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason == CP_RECOVERY)
f2fs_msg(sbi->sb, KERN_NOTICE,
"checkpoint: version = %llx", ckpt_ver);
+
+ /* do checkpoint periodically */
+ f2fs_update_time(sbi, CP_TIME);
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
mutex_unlock(&sbi->cp_mutex);
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+ return err;
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
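
Annotation: the checkpoint.c changes fold the separately allocated inode_entry into f2fs_inode_info itself (fi->dirty_list) and split the single dir_inode_list into per-type lists guarded by per-type locks. Embedding the node also removes the alloc/-EEXIST/free dance previously needed in update_dirty_page(). A compact sketch of that embedded-node pattern; locking is omitted (the real code holds sbi->inode_lock[type]) and every name is hypothetical.

#include <stdio.h>

enum inode_type { DIR_INODE, FILE_INODE, NR_INODE_TYPE };

struct node { struct node *prev, *next; };

struct inode_info {
	int ino;
	int dirty;			/* stands in for FI_DIRTY_* flags */
	struct node dirty_list;		/* embedded, so no kmem alloc/free */
};

static struct node lists[NR_INODE_TYPE];	/* sbi->inode_list[type] */

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void add_dirty(struct inode_info *fi, enum inode_type type)
{
	if (fi->dirty)
		return;		/* already queued: nothing to allocate or undo */
	fi->dirty = 1;
	list_add_tail(&fi->dirty_list, &lists[type]);
}

int main(void)
{
	struct inode_info a = { .ino = 3 }, b = { .ino = 7 };
	int t;

	for (t = 0; t < NR_INODE_TYPE; t++)
		lists[t].prev = lists[t].next = &lists[t];

	add_dirty(&a, DIR_INODE);
	add_dirty(&b, FILE_INODE);
	add_dirty(&a, DIR_INODE);	/* second call is a no-op */
	printf("a.dirty=%d b.dirty=%d\n", a.dirty, b.dirty);
	return 0;
}
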
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
index 9f77de2ef317..5de2d866a25c 100644
--- a/fs/f2fs/crypto_key.c
+++ b/fs/f2fs/crypto_key.c
@@ -122,7 +122,7 @@ int _f2fs_get_encryption_info(struct inode *inode)
struct key *keyring_key = NULL;
struct f2fs_encryption_key *master_key;
struct f2fs_encryption_context ctx;
- struct user_key_payload *ukp;
+ const struct user_key_payload *ukp;
struct crypto_ablkcipher *ctfm;
const char *cipher_str;
char raw_key[F2FS_MAX_KEY_SIZE];
@@ -199,7 +199,7 @@ retry:
}
crypt_info->ci_keyring_key = keyring_key;
BUG_ON(keyring_key->type != &key_type_logon);
- ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
res = -EINVAL;
goto out;
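
Annotation: the crypto_key.c hunks replace an open-coded cast of keyring_key->payload.data with the user_key_payload() accessor and make the resulting pointer const, so callers cannot scribble on key material. The accessor pattern in miniature, with illustrative types only:

#include <stdio.h>

struct payload { int datalen; char data[16]; };
struct key_mock { struct payload payload; };

/* Analogue of user_key_payload(): one helper owns the cast, and the
 * caller only ever sees a const view of the payload. */
static const struct payload *key_payload(const struct key_mock *k)
{
	return &k->payload;
}

int main(void)
{
	struct key_mock k = { { .datalen = 4, .data = "abc" } };
	const struct payload *ukp = key_payload(&k);

	printf("datalen=%d\n", ukp->datalen);
	return 0;
}
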
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a82abe921b89..ac9e7c6aac74 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -225,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn)
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
- set_page_dirty(node_page);
+ if (set_page_dirty(node_page))
+ dn->node_changed = true;
}
int reserve_new_block(struct dnode_of_data *dn)
@@ -275,7 +276,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
return f2fs_reserve_block(dn, index);
}
-struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
+struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+ int rw, bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -292,7 +294,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return read_mapping_page(mapping, index, NULL);
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, for_write);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -352,7 +354,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, false);
if (IS_ERR(page))
return page;
@@ -372,12 +374,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
* Because the callers (functions in dir.c and GC) should be able to know
* whether this page exists or not.
*/
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+ bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, for_write);
if (IS_ERR(page))
return page;
@@ -410,8 +413,8 @@ struct page *get_new_data_page(struct inode *inode,
struct page *page;
struct dnode_of_data dn;
int err;
-repeat:
- page = grab_cache_page(mapping, index);
+
+ page = f2fs_grab_cache_page(mapping, index, true);
if (!page) {
/*
* before exiting, we should make sure ipage will be released
@@ -439,17 +442,16 @@ repeat:
} else {
f2fs_put_page(page, 1);
- page = get_read_data_page(inode, index, READ_SYNC);
+ /* if ipage exists, blkaddr should be NEW_ADDR */
+ f2fs_bug_on(F2FS_I_SB(inode), ipage);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- goto repeat;
-
- /* wait for read completion */
- lock_page(page);
+ return page;
}
got_it:
- if (new_i_size &&
- i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
- i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ if (new_i_size && i_size_read(inode) <
+ ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
+ i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
/* Only the directory inode sets new_i_size */
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
}
@@ -489,16 +491,13 @@ alloc:
/* update i_size */
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
- i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
-
- /* direct IO doesn't use extent cache to maximize the performance */
- f2fs_drop_largest_extent(dn->inode, fofs);
-
+ if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
+ i_size_write(dn->inode,
+ ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
return 0;
}
-static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+static int __allocate_data_blocks(struct inode *inode, loff_t offset,
size_t count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -507,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
u64 len = F2FS_BYTES_TO_BLK(count);
bool allocated;
u64 end_offset;
+ int err = 0;
while (len) {
- f2fs_balance_fs(sbi);
f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+ err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+ if (err)
goto out;
allocated = false;
@@ -523,9 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
while (dn.ofs_in_node < end_offset && len) {
block_t blkaddr;
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
+
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
- if (__allocate_data_block(&dn))
+ err = __allocate_data_block(&dn);
+ if (err)
goto sync_out;
allocated = true;
}
@@ -539,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
}
- return;
+ return err;
sync_out:
if (allocated)
@@ -548,7 +556,8 @@ sync_out:
f2fs_put_dnode(&dn);
out:
f2fs_unlock_op(sbi);
- return;
+ f2fs_balance_fs(sbi, dn.node_changed);
+ return err;
}
/*
@@ -560,16 +569,18 @@ out:
* b. do not use extent cache for better performance
* c. give the block addresses to blockdev
*/
-static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag)
{
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
pgoff_t pgofs, end_offset;
int err = 0, ofs = 1;
struct extent_info ei;
bool allocated = false;
+ block_t blkaddr;
map->m_len = 0;
map->m_flags = 0;
@@ -585,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
}
if (create)
- f2fs_lock_op(F2FS_I_SB(inode));
+ f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -595,50 +606,59 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
err = 0;
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR) {
- if (flag == F2FS_GET_BLOCK_BMAP) {
- err = -ENOENT;
- goto put_out;
- } else if (flag == F2FS_GET_BLOCK_READ ||
- flag == F2FS_GET_BLOCK_DIO) {
- goto put_out;
+
+ if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto put_out;
+ }
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto put_out;
+ allocated = true;
+ map->m_flags = F2FS_MAP_NEW;
+ } else {
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ dn.data_blkaddr != NEW_ADDR) {
+ if (flag == F2FS_GET_BLOCK_BMAP)
+ err = -ENOENT;
+ goto put_out;
+ }
+
+ /*
+ * preallocated unwritten block should be mapped
+ * for fiemap.
+ */
+ if (dn.data_blkaddr == NEW_ADDR)
+ map->m_flags = F2FS_MAP_UNWRITTEN;
}
- /*
- * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP),
- * mark it as mapped and unwritten block.
- */
}
- if (dn.data_blkaddr != NULL_ADDR) {
- map->m_flags = F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- if (dn.data_blkaddr == NEW_ADDR)
- map->m_flags |= F2FS_MAP_UNWRITTEN;
- } else if (create) {
- err = __allocate_data_block(&dn);
- if (err)
- goto put_out;
- allocated = true;
- map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- } else {
- if (flag == F2FS_GET_BLOCK_BMAP)
- err = -ENOENT;
- goto put_out;
- }
+ map->m_flags |= F2FS_MAP_MAPPED;
+ map->m_pblk = dn.data_blkaddr;
+ map->m_len = 1;
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
- map->m_len = 1;
dn.ofs_in_node++;
pgofs++;
get_next:
+ if (map->m_len >= maxblocks)
+ goto sync_out;
+
if (dn.ofs_in_node >= end_offset) {
if (allocated)
sync_inode_page(&dn);
allocated = false;
f2fs_put_dnode(&dn);
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+ f2fs_lock_op(sbi);
+ }
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
@@ -647,43 +667,56 @@ get_next:
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR &&
- flag != F2FS_GET_BLOCK_FIEMAP)
- goto put_out;
-
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
}
- if (maxblocks > map->m_len) {
- block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (blkaddr == NULL_ADDR && create) {
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
err = __allocate_data_block(&dn);
if (err)
goto sync_out;
allocated = true;
map->m_flags |= F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
- }
- /* Give more consecutive addresses for the readahead */
- if ((map->m_pblk != NEW_ADDR &&
- blkaddr == (map->m_pblk + ofs)) ||
- (map->m_pblk == NEW_ADDR &&
- blkaddr == NEW_ADDR)) {
- ofs++;
- dn.ofs_in_node++;
- pgofs++;
- map->m_len++;
- goto get_next;
+ } else {
+ /*
+ * we only merge preallocated unwritten blocks
+ * for fiemap.
+ */
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ blkaddr != NEW_ADDR)
+ goto sync_out;
}
}
+
+ /* Give more consecutive addresses for the readahead */
+ if ((map->m_pblk != NEW_ADDR &&
+ blkaddr == (map->m_pblk + ofs)) ||
+ (map->m_pblk == NEW_ADDR &&
+ blkaddr == NEW_ADDR)) {
+ ofs++;
+ dn.ofs_in_node++;
+ pgofs++;
+ map->m_len++;
+ goto get_next;
+ }
+
sync_out:
if (allocated)
sync_inode_page(&dn);
put_out:
f2fs_put_dnode(&dn);
unlock_out:
- if (create)
- f2fs_unlock_op(F2FS_I_SB(inode));
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+ }
out:
trace_f2fs_map_blocks(inode, map, err);
return err;
@@ -723,6 +756,10 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock,
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ /* the block number must be below the F2FS maximum file block count */
+ if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
+ return -EFBIG;
+
return __get_data_block(inode, iblock, bh_result, create,
F2FS_GET_BLOCK_BMAP);
}
@@ -742,28 +779,36 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
struct buffer_head map_bh;
sector_t start_blk, last_blk;
- loff_t isize = i_size_read(inode);
+ loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
- bool past_eof = false, whole_file = false;
int ret = 0;
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
if (ret)
return ret;
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+
mutex_lock(&inode->i_mutex);
- if (len >= isize) {
- whole_file = true;
- len = isize;
- }
+ isize = i_size_read(inode);
+ if (start >= isize)
+ goto out;
+
+ if (start + len > isize)
+ len = isize - start;
if (logical_to_blk(inode, len) == 0)
len = blk_to_logical(inode, 1);
start_blk = logical_to_blk(inode, start);
last_blk = logical_to_blk(inode, start + len - 1);
+
next:
memset(&map_bh, 0, sizeof(struct buffer_head));
map_bh.b_size = len;
@@ -775,59 +820,37 @@ next:
/* HOLE */
if (!buffer_mapped(&map_bh)) {
- start_blk++;
-
- if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
- past_eof = 1;
-
- if (past_eof && size) {
- flags |= FIEMAP_EXTENT_LAST;
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- } else if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- size = 0;
- }
+ /* Go through holes until we pass the EOF */
+ if (blk_to_logical(inode, start_blk++) < isize)
+ goto prep_next;
+ /* A hole beyond isize means there are no more extents.
+ * Note that the premise is that filesystems don't
+ * punch holes past isize while leaving the size unchanged.
+ */
+ flags |= FIEMAP_EXTENT_LAST;
+ }
- /* if we have holes up to/past EOF then we're done */
- if (start_blk > last_blk || past_eof || ret)
- goto out;
- } else {
- if (start_blk > last_blk && !whole_file) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- goto out;
- }
+ if (size) {
+ if (f2fs_encrypted_inode(inode))
+ flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
- /*
- * if size != 0 then we know we already have an extent
- * to add, so add it.
- */
- if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- if (ret)
- goto out;
- }
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ }
- logical = blk_to_logical(inode, start_blk);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = map_bh.b_size;
- flags = 0;
- if (buffer_unwritten(&map_bh))
- flags = FIEMAP_EXTENT_UNWRITTEN;
+ if (start_blk > last_blk || ret)
+ goto out;
- start_blk += logical_to_blk(inode, size);
+ logical = blk_to_logical(inode, start_blk);
+ phys = blk_to_logical(inode, map_bh.b_blocknr);
+ size = map_bh.b_size;
+ flags = 0;
+ if (buffer_unwritten(&map_bh))
+ flags = FIEMAP_EXTENT_UNWRITTEN;
- /*
- * If we are past the EOF, then we need to make sure as
- * soon as we find a hole that the last extent we found
- * is marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof && logical + size >= isize)
- past_eof = true;
- }
+ start_blk += logical_to_blk(inode, size);
+
+prep_next:
cond_resched();
if (fatal_signal_pending(current))
ret = -EINTR;
@@ -903,7 +926,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
- if (f2fs_map_blocks(inode, &map, 0, false))
+ if (f2fs_map_blocks(inode, &map, 0,
+ F2FS_GET_BLOCK_READ))
goto set_error_page;
}
got_it:
@@ -936,21 +960,14 @@ submit_and_realloc:
if (f2fs_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- struct page *cpage;
ctx = f2fs_get_crypto_ctx(inode);
if (IS_ERR(ctx))
goto set_error_page;
/* wait for the page to be moved by cleaning */
- cpage = find_lock_page(
- META_MAPPING(F2FS_I_SB(inode)),
- block_nr);
- if (cpage) {
- f2fs_wait_on_page_writeback(cpage,
- DATA);
- f2fs_put_page(cpage, 1);
- }
+ f2fs_wait_on_encrypted_page_writeback(
+ F2FS_I_SB(inode), block_nr);
}
bio = bio_alloc(GFP_KERNEL,
@@ -1012,6 +1029,9 @@ static int f2fs_read_data_pages(struct file *file,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = file->f_mapping->host;
+ struct page *page = list_entry(pages->prev, struct page, lru);
+
+ trace_f2fs_readpages(inode, page, nr_pages);
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
@@ -1041,6 +1061,11 @@ int do_write_data_page(struct f2fs_io_info *fio)
}
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+
+ /* wait for GCed encrypted page writeback */
+ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
+ fio->blk_addr);
+
fio->encrypted_page = f2fs_encrypt(inode, fio->page);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
@@ -1056,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
*/
if (unlikely(fio->blk_addr != NEW_ADDR &&
!is_cold_data(page) &&
+ !IS_ATOMIC_WRITTEN_PAGE(page) &&
need_inplace_update(inode))) {
rewrite_data_page(fio);
set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
@@ -1152,10 +1178,11 @@ out:
if (err)
ClearPageUptodate(page);
unlock_page(page);
- if (need_balance_fs)
- f2fs_balance_fs(sbi);
- if (wbc->for_reclaim)
+ f2fs_balance_fs(sbi, need_balance_fs);
+ if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ remove_dirty_inode(inode);
+ }
return 0;
redirty_out:
@@ -1327,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
+ /* skip writing during file defragment */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+ goto skip_write;
+
/* during POR, we don't need to trigger writepage at all. */
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
@@ -1342,7 +1373,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
if (locked)
mutex_unlock(&sbi->writepages);
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
@@ -1355,11 +1386,83 @@ skip_write:
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
+ loff_t i_size = i_size_read(inode);
+
+ if (to > i_size) {
+ truncate_pagecache(inode, i_size);
+ truncate_blocks(inode, i_size, true);
+ }
+}
+
+static int prepare_write_begin(struct f2fs_sb_info *sbi,
+ struct page *page, loff_t pos, unsigned len,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct inode *inode = page->mapping->host;
+ pgoff_t index = page->index;
+ struct dnode_of_data dn;
+ struct page *ipage;
+ bool locked = false;
+ struct extent_info ei;
+ int err = 0;
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size, true);
+ if (f2fs_has_inline_data(inode) ||
+ (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+ f2fs_lock_op(sbi);
+ locked = true;
}
+restart:
+ /* check inline_data */
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode)) {
+ if (pos + len <= MAX_INLINE_DATA) {
+ read_inline_data(page, ipage);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ sync_inode_page(&dn);
+ } else {
+ err = f2fs_convert_inline_page(&dn, page);
+ if (err)
+ goto out;
+ if (dn.data_blkaddr == NULL_ADDR)
+ err = f2fs_get_block(&dn, index);
+ }
+ } else if (locked) {
+ err = f2fs_get_block(&dn, index);
+ } else {
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ } else {
+ bool restart = false;
+
+ /* hole case */
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err || dn.data_blkaddr == NULL_ADDR)
+ restart = true;
+ if (restart) {
+ f2fs_put_dnode(&dn);
+ f2fs_lock_op(sbi);
+ locked = true;
+ goto restart;
+ }
+ }
+ }
+
+ /* convert_inline_page can make node_changed */
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+out:
+ f2fs_put_dnode(&dn);
+unlock_out:
+ if (locked)
+ f2fs_unlock_op(sbi);
+ return err;
}
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -1369,15 +1472,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
- struct page *ipage;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
- struct dnode_of_data dn;
+ bool need_balance = false;
+ block_t blkaddr = NULL_ADDR;
int err = 0;
trace_f2fs_write_begin(inode, pos, len, flags);
- f2fs_balance_fs(sbi);
-
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
@@ -1397,38 +1498,28 @@ repeat:
*pagep = page;
- f2fs_lock_op(sbi);
-
- /* check inline_data */
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto unlock_fail;
- }
-
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ err = prepare_write_begin(sbi, page, pos, len,
+ &blkaddr, &need_balance);
+ if (err)
+ goto fail;
- if (f2fs_has_inline_data(inode)) {
- if (pos + len <= MAX_INLINE_DATA) {
- read_inline_data(page, ipage);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- sync_inode_page(&dn);
- goto put_next;
+ if (need_balance && has_not_enough_free_secs(sbi, 0)) {
+ unlock_page(page);
+ f2fs_balance_fs(sbi, true);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ f2fs_put_page(page, 1);
+ goto repeat;
}
- err = f2fs_convert_inline_page(&dn, page);
- if (err)
- goto put_fail;
}
- err = f2fs_get_block(&dn, index);
- if (err)
- goto put_fail;
-put_next:
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
-
f2fs_wait_on_page_writeback(page, DATA);
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+
if (len == PAGE_CACHE_SIZE)
goto out_update;
if (PageUptodate(page))
@@ -1443,14 +1534,14 @@ put_next:
goto out_update;
}
- if (dn.data_blkaddr == NEW_ADDR) {
+ if (blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
.rw = READ_SYNC,
- .blk_addr = dn.data_blkaddr,
+ .blk_addr = blkaddr,
.page = page,
.encrypted_page = NULL,
};
@@ -1481,10 +1572,6 @@ out_clear:
clear_cold_data(page);
return 0;
-put_fail:
- f2fs_put_dnode(&dn);
-unlock_fail:
- f2fs_unlock_op(sbi);
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(mapping, pos + len);
@@ -1509,6 +1596,7 @@ static int f2fs_write_end(struct file *file,
}
f2fs_put_page(page, 1);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
@@ -1536,11 +1624,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
int err;
/* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return 0;
@@ -1551,10 +1637,14 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == WRITE)
- __allocate_data_blocks(inode, offset, count);
+ if (iov_iter_rw(iter) == WRITE) {
+ err = __allocate_data_blocks(inode, offset, count);
+ if (err)
+ goto out;
+ }
err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
+out:
if (err < 0 && iov_iter_rw(iter) == WRITE)
f2fs_write_failed(mapping, offset + count);
@@ -1636,12 +1726,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- /* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- int err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ /* make sure all the file's blocks are allocated before bmap */
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ filemap_write_and_wait(mapping);
+
return generic_block_bmap(mapping, block, get_data_block_bmap);
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index d013d8479753..4fb6ef88a34f 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,17 +33,20 @@ static void update_general_status(struct f2fs_sb_info *sbi)
int i;
/* validation check of the segment numbers */
- si->hit_largest = atomic_read(&sbi->read_hit_largest);
- si->hit_cached = atomic_read(&sbi->read_hit_cached);
- si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree);
+ si->hit_largest = atomic64_read(&sbi->read_hit_largest);
+ si->hit_cached = atomic64_read(&sbi->read_hit_cached);
+ si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
- si->total_ext = atomic_read(&sbi->total_hit_ext);
- si->ext_tree = sbi->total_ext_tree;
+ si->total_ext = atomic64_read(&sbi->total_hit_ext);
+ si->ext_tree = atomic_read(&sbi->total_ext_tree);
+ si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
si->ext_node = atomic_read(&sbi->total_ext_node);
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
- si->ndirty_dirs = sbi->n_dirty_dirs;
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+ si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
+ si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
@@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+ blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
@@ -118,7 +121,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
}
}
dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
- si->bimodal = div_u64(bimodal, dist);
+ si->bimodal = div64_u64(bimodal, dist);
if (si->dirty_count)
si->avg_vblocks = div_u64(total_vblocks, ndirty);
else
@@ -189,18 +192,18 @@ get_cache:
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
- si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
for (i = 0; i <= UPDATE_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
- si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+ si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree);
si->cache_mem += atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node);
si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -211,12 +214,10 @@ static int stat_show(struct seq_file *s, void *v)
mutex_lock(&f2fs_stat_mutex);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
- char devname[BDEVNAME_SIZE];
-
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
- bdevname(si->sbi->sb->s_bdev, devname), i++);
+ seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n",
+ si->sbi->sb->s_bdev, i++);
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -269,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
- seq_printf(s, "CP calls: %d\n", si->cp_count);
+ seq_printf(s, "CP calls: %d (BG: %d)\n",
+ si->cp_count, si->bg_cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
@@ -283,15 +285,15 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
seq_puts(s, "\nExtent Cache:\n");
- seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n",
+ seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
si->hit_largest, si->hit_cached,
si->hit_rbtree);
- seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n",
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
!si->total_ext ? 0 :
- (si->hit_total * 100) / si->total_ext,
+ div64_u64(si->hit_total * 100, si->total_ext),
si->hit_total, si->total_ext);
- seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n",
- si->ext_tree, si->ext_node);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - inmem: %4d, wb: %4d\n",
si->inmem_pages, si->wb_pages);
@@ -299,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d\n",
si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - datas: %4d in files:%4d\n",
+ si->ndirty_data, si->ndirty_files);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
@@ -333,13 +337,13 @@ static int stat_show(struct seq_file *s, void *v)
/* memory footprint */
update_mem_info(si->sbi);
- seq_printf(s, "\nMemory: %u KB\n",
+ seq_printf(s, "\nMemory: %llu KB\n",
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
- seq_printf(s, " - static: %u KB\n",
+ seq_printf(s, " - static: %llu KB\n",
si->base_mem >> 10);
- seq_printf(s, " - cached: %u KB\n",
+ seq_printf(s, " - cached: %llu KB\n",
si->cache_mem >> 10);
- seq_printf(s, " - paged : %u KB\n",
+ seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
mutex_unlock(&f2fs_stat_mutex);
@@ -378,10 +382,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
- atomic_set(&sbi->total_hit_ext, 0);
- atomic_set(&sbi->read_hit_rbtree, 0);
- atomic_set(&sbi->read_hit_largest, 0);
- atomic_set(&sbi->read_hit_cached, 0);
+ atomic64_set(&sbi->total_hit_ext, 0);
+ atomic64_set(&sbi->read_hit_rbtree, 0);
+ atomic64_set(&sbi->read_hit_largest, 0);
+ atomic64_set(&sbi->read_hit_cached, 0);
atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
@@ -406,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kfree(si);
}
-void __init f2fs_create_root_stats(void)
+int __init f2fs_create_root_stats(void)
{
struct dentry *file;
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
if (!f2fs_debugfs_root)
- return;
+ return -ENOMEM;
file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
NULL, &stat_fops);
if (!file) {
debugfs_remove(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
+ return -ENOMEM;
}
+
+ return 0;
}
void f2fs_destroy_root_stats(void)
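
The debug.c hunks above widen the extent-cache hit counters to atomic64_t and compute the hit ratio via div64_u64(): a busy mount overflows a 32-bit counter surprisingly fast, and 32-bit kernels cannot open-code a 64-by-64 divide. A minimal userspace sketch of both points; hit_ratio() is illustrative, not f2fs API.

#include <stdint.h>
#include <stdio.h>

/* mirrors the patched expression: guard the zero divisor and
 * scale before dividing (the kernel needs div64_u64() for this) */
static uint64_t hit_ratio(uint64_t hit_total, uint64_t total_ext)
{
	return !total_ext ? 0 : (hit_total * 100) / total_ext;
}

int main(void)
{
	uint32_t hits = 4200000000u;		/* near UINT32_MAX */

	/* the old atomic_t counter silently wraps past zero */
	printf("32-bit counter + 200M more hits: %u\n", hits + 200000000u);
	printf("hit ratio: %llu%%\n",
	       (unsigned long long)hit_ratio(86, 100));
	return 0;
}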
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8f15fc134040..faa7495e2d7e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
namehash = f2fs_dentry_hash(&name);
- f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
-
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
goto out;
max_depth = F2FS_I(dir)->i_current_depth;
+ if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
+ f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING,
+ "Corrupted max_depth of %lu: %u",
+ dir->i_ino, max_depth);
+ max_depth = MAX_DIR_HASH_DEPTH;
+ F2FS_I(dir)->i_current_depth = max_depth;
+ mark_inode_dirty(dir);
+ }
for (level = 0; level < max_depth; level++) {
de = find_in_level(dir, level, &fname, res_page);
@@ -258,7 +264,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
if (f2fs_has_inline_dentry(dir))
return f2fs_parent_inline_dir(dir, p);
- page = get_lock_data_page(dir, 0);
+ page = get_lock_data_page(dir, 0, false);
if (IS_ERR(page))
return NULL;
@@ -444,7 +450,7 @@ error:
/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
truncate_inode_pages(&inode->i_data, 0);
truncate_blocks(inode, 0, false);
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
}
@@ -630,6 +636,7 @@ fail:
f2fs_put_page(dentry_page, 1);
out:
f2fs_fname_free_filename(&fname);
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
}
@@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
fail:
up_write(&F2FS_I(inode)->i_sem);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
@@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+
if (f2fs_has_inline_dentry(dir))
return f2fs_delete_inline_entry(dentry, page, dir, inode);
@@ -740,7 +750,7 @@ bool f2fs_empty_dir(struct inode *dir)
return f2fs_empty_inline_dir(dir);
for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = get_lock_data_page(dir, bidx);
+ dentry_page = get_lock_data_page(dir, bidx, false);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT)
continue;
@@ -787,7 +797,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
else
d_type = DT_UNKNOWN;
- /* encrypted case */
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -795,12 +804,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
int save_len = fstr->len;
int ret;
+ de_name.name = kmalloc(de_name.len, GFP_NOFS);
+ if (!de_name.name)
+ return false;
+
+ memcpy(de_name.name, d->filename[bit_pos], de_name.len);
+
ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
&de_name, fstr);
- de_name = *fstr;
- fstr->len = save_len;
+ kfree(de_name.name);
if (ret < 0)
return true;
+
+ de_name = *fstr;
+ fstr->len = save_len;
}
if (!dir_emit(ctx, de_name.name, de_name.len,
@@ -847,26 +864,28 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
for (; n < npages; n++) {
- dentry_page = get_lock_data_page(inode, n);
- if (IS_ERR(dentry_page))
- continue;
+ dentry_page = get_lock_data_page(inode, n, false);
+ if (IS_ERR(dentry_page)) {
+ err = PTR_ERR(dentry_page);
+ if (err == -ENOENT)
+ continue;
+ else
+ goto out;
+ }
dentry_blk = kmap(dentry_page);
make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr))
- goto stop;
+ if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ break;
+ }
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
- dentry_page = NULL;
- }
-stop:
- if (dentry_page && !IS_ERR(dentry_page)) {
- kunmap(dentry_page);
- f2fs_put_page(dentry_page, 1);
}
out:
f2fs_fname_crypto_free_buffer(&fstr);
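
The f2fs_readdir() rework above stops swallowing read errors: a hole (-ENOENT) is still skipped, anything else aborts the walk. A small userspace sketch of that error policy, with get_block() as a hypothetical stand-in for get_lock_data_page():

#include <errno.h>
#include <stdio.h>

static int get_block(int n)
{
	if (n == 2)
		return -ENOENT;	/* hole: legal, keep scanning */
	if (n == 4)
		return -EIO;	/* real failure: must not be swallowed */
	return 0;
}

int main(void)
{
	int n, err = 0;

	for (n = 0; n < 6; n++) {
		err = get_block(n);
		if (err == -ENOENT) {
			err = 0;
			continue;
		}
		if (err)
			break;	/* propagate instead of 'continue' */
		printf("emitted entries from block %d\n", n);
	}
	return err ? 1 : 0;
}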
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 997ac86f2a1d..ccd5c636d3fe 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -36,7 +36,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
rb_link_node(&en->rb_node, parent, p);
rb_insert_color(&en->rb_node, &et->root);
- et->count++;
+ atomic_inc(&et->node_cnt);
atomic_inc(&sbi->total_ext_node);
return en;
}
@@ -45,7 +45,7 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
rb_erase(&en->rb_node, &et->root);
- et->count--;
+ atomic_dec(&et->node_cnt);
atomic_dec(&sbi->total_ext_node);
if (et->cached_en == en)
@@ -68,11 +68,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
et->root = RB_ROOT;
et->cached_en = NULL;
rwlock_init(&et->lock);
- atomic_set(&et->refcount, 0);
- et->count = 0;
- sbi->total_ext_tree++;
+ INIT_LIST_HEAD(&et->list);
+ atomic_set(&et->node_cnt, 0);
+ atomic_inc(&sbi->total_ext_tree);
+ } else {
+ atomic_dec(&sbi->total_zombie_tree);
+ list_del_init(&et->list);
}
- atomic_inc(&et->refcount);
up_write(&sbi->extent_tree_lock);
/* never dies until evict_inode */
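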
@@ -131,7 +133,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
{
struct rb_node *node, *next;
struct extent_node *en;
- unsigned int count = et->count;
+ unsigned int count = atomic_read(&et->node_cnt);
node = rb_first(&et->root);
while (node) {
@@ -152,45 +154,45 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
node = next;
}
- return count - et->count;
+ return count - atomic_read(&et->node_cnt);
}
-static void __drop_largest_extent(struct inode *inode, pgoff_t fofs)
+static void __drop_largest_extent(struct inode *inode,
+ pgoff_t fofs, unsigned int len)
{
struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
- if (largest->fofs <= fofs && largest->fofs + largest->len > fofs)
+ if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs)
largest->len = 0;
}
-void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
-{
- if (!f2fs_may_extent_tree(inode))
- return;
-
- __drop_largest_extent(inode, fofs);
-}
-
-void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+/* return true if the inode page is changed */
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
- if (!f2fs_may_extent_tree(inode))
- return;
+ if (!f2fs_may_extent_tree(inode)) {
+ /* drop largest extent */
+ if (i_ext && i_ext->len) {
+ i_ext->len = 0;
+ return true;
+ }
+ return false;
+ }
et = __grab_extent_tree(inode);
- if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
- return;
+ if (!i_ext || !i_ext->len)
+ return false;
set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
write_lock(&et->lock);
- if (et->count)
+ if (atomic_read(&et->node_cnt))
goto out;
en = __init_extent_tree(sbi, et, &ei);
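
__drop_largest_extent() above now takes a length and uses the standard half-open interval-overlap test, where the old code only noticed when fofs itself fell inside the largest extent. A tiny sketch of the predicate, with illustrative values:

#include <stdbool.h>
#include <stdio.h>

/* [a, a+alen) intersects [b, b+blen) iff a < b+blen && b < a+alen */
static bool overlaps(unsigned a, unsigned alen, unsigned b, unsigned blen)
{
	return a < b + blen && b < a + alen;
}

int main(void)
{
	/* largest extent covers file offsets [100, 200) */
	printf("%d\n", overlaps(150, 10, 100, 100));	/* 1: inside */
	printf("%d\n", overlaps(90, 20, 100, 100));	/* 1: straddles start */
	printf("%d\n", overlaps(200, 10, 100, 100));	/* 0: adjacent only */
	return 0;
}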
@@ -201,6 +203,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
}
out:
write_unlock(&et->lock);
+ return false;
}
static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
@@ -350,8 +353,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
}
if (en) {
- if (en->ei.len > et->largest.len)
- et->largest = en->ei;
+ __try_update_largest_extent(et, en);
et->cached_en = en;
}
return en;
@@ -388,18 +390,17 @@ do_insert:
if (!en)
return NULL;
- if (en->ei.len > et->largest.len)
- et->largest = en->ei;
+ __try_update_largest_extent(et, en);
et->cached_en = en;
return en;
}
-unsigned int f2fs_update_extent_tree_range(struct inode *inode,
+static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
- struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+ struct extent_node *en = NULL, *en1 = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei, dei, prev;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
@@ -409,6 +410,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (!et)
return false;
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+
write_lock(&et->lock);
if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
@@ -419,148 +422,99 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
prev = et->largest;
dei.len = 0;
- /* we do not guarantee that the largest extent is cached all the time */
- __drop_largest_extent(inode, fofs);
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(inode, fofs, len);
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
&insert_p, &insert_parent);
- if (!en) {
- if (next_en) {
- en = next_en;
- f2fs_bug_on(sbi, en->ei.fofs <= pos);
- pos = en->ei.fofs;
- } else {
- /*
- * skip searching in the tree since there is no
- * larger extent node in the cache.
- */
- goto update_extent;
- }
- }
+ if (!en)
+ en = next_en;
/* 2. invalidate all extent nodes in range [fofs, fofs + len - 1] */
- while (en) {
- struct rb_node *node;
+ while (en && en->ei.fofs < end) {
+ unsigned int org_end;
+ int parts = 0; /* # of parts current extent split into */
- if (pos >= end)
- break;
+ next_en = en1 = NULL;
dei = en->ei;
- en1 = en2 = NULL;
+ org_end = dei.fofs + dei.len;
+ f2fs_bug_on(sbi, pos >= org_end);
- node = rb_next(&en->rb_node);
+ if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+ en->ei.len = pos - en->ei.fofs;
+ prev_en = en;
+ parts = 1;
+ }
- /*
- * 2.1 there are four cases when we invalidate blkaddr in extent
- * node, |V: valid address, X: will be invalidated|
- */
- /* case#1, invalidate right part of extent node |VVVVVXXXXX| */
- if (pos > dei.fofs && end >= dei.fofs + dei.len) {
- en->ei.len = pos - dei.fofs;
-
- if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
- __detach_extent_node(sbi, et, en);
- insert_p = NULL;
- insert_parent = NULL;
- goto update;
+ if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (parts) {
+ set_extent_info(&ei, end,
+ end - dei.fofs + dei.blk,
+ org_end - end);
+ en1 = __insert_extent_tree(sbi, et, &ei,
+ NULL, NULL);
+ next_en = en1;
+ } else {
+ en->ei.fofs = end;
+ en->ei.blk += end - dei.fofs;
+ en->ei.len -= end - dei.fofs;
+ next_en = en;
}
-
- if (__is_extent_same(&dei, &et->largest))
- et->largest = en->ei;
- goto next;
+ parts++;
}
- /* case#2, invalidate left part of extent node |XXXXXVVVVV| */
- if (pos <= dei.fofs && end < dei.fofs + dei.len) {
- en->ei.fofs = end;
- en->ei.blk += end - dei.fofs;
- en->ei.len -= end - dei.fofs;
-
- if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
- __detach_extent_node(sbi, et, en);
- insert_p = NULL;
- insert_parent = NULL;
- goto update;
- }
+ if (!next_en) {
+ struct rb_node *node = rb_next(&en->rb_node);
- if (__is_extent_same(&dei, &et->largest))
- et->largest = en->ei;
- goto next;
+ next_en = node ?
+ rb_entry(node, struct extent_node, rb_node)
+ : NULL;
}
- __detach_extent_node(sbi, et, en);
+ if (parts)
+ __try_update_largest_extent(et, en);
+ else
+ __detach_extent_node(sbi, et, en);
/*
- * if we remove node in rb-tree, our parent node pointer may
- * point the wrong place, discard them.
+ * if the original extent is split into zero or two parts, the
+ * extent tree has been altered by deletion or insertion, so the
+ * cached insertion pointers into the tree must be invalidated.
*/
- insert_p = NULL;
- insert_parent = NULL;
-
- /* case#3, invalidate entire extent node |XXXXXXXXXX| */
- if (pos <= dei.fofs && end >= dei.fofs + dei.len) {
- if (__is_extent_same(&dei, &et->largest))
- et->largest.len = 0;
- goto update;
+ if (parts != 1) {
+ insert_p = NULL;
+ insert_parent = NULL;
}
- /*
- * case#4, invalidate data in the middle of extent node
- * |VVVXXXXVVV|
- */
- if (dei.len > F2FS_MIN_EXTENT_LEN) {
- unsigned int endofs;
-
- /* insert left part of split extent into cache */
- if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
- set_extent_info(&ei, dei.fofs, dei.blk,
- pos - dei.fofs);
- en1 = __insert_extent_tree(sbi, et, &ei,
- NULL, NULL);
- }
-
- /* insert right part of split extent into cache */
- endofs = dei.fofs + dei.len;
- if (endofs - end >= F2FS_MIN_EXTENT_LEN) {
- set_extent_info(&ei, end,
- end - dei.fofs + dei.blk,
- endofs - end);
- en2 = __insert_extent_tree(sbi, et, &ei,
- NULL, NULL);
- }
- }
-update:
- /* 2.2 update in global extent list */
+ /* update in global extent list */
spin_lock(&sbi->extent_lock);
- if (en && !list_empty(&en->list))
+ if (!parts && !list_empty(&en->list))
list_del(&en->list);
if (en1)
list_add_tail(&en1->list, &sbi->extent_list);
- if (en2)
- list_add_tail(&en2->list, &sbi->extent_list);
spin_unlock(&sbi->extent_lock);
- /* 2.3 release extent node */
- if (en)
+ /* release extent node */
+ if (!parts)
kmem_cache_free(extent_node_slab, en);
-next:
- en = node ? rb_entry(node, struct extent_node, rb_node) : NULL;
- next_en = en;
- if (en)
- pos = en->ei.fofs;
+
+ en = next_en;
}
-update_extent:
/* 3. update extent in extent cache */
if (blkaddr) {
struct extent_node *den = NULL;
set_extent_info(&ei, fofs, blkaddr, len);
- en3 = __try_merge_extent_node(sbi, et, &ei, &den,
+ en1 = __try_merge_extent_node(sbi, et, &ei, &den,
prev_en, next_en);
- if (!en3)
- en3 = __insert_extent_tree(sbi, et, &ei,
+ if (!en1)
+ en1 = __insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent);
/* give up the extent cache if splits and small updates happen */
@@ -572,11 +526,11 @@ update_extent:
}
spin_lock(&sbi->extent_lock);
- if (en3) {
- if (list_empty(&en3->list))
- list_add_tail(&en3->list, &sbi->extent_list);
+ if (en1) {
+ if (list_empty(&en1->list))
+ list_add_tail(&en1->list, &sbi->extent_list);
else
- list_move_tail(&en3->list, &sbi->extent_list);
+ list_move_tail(&en1->list, &sbi->extent_list);
}
if (den && !list_empty(&den->list))
list_del(&den->list);
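
The range-update rewrite above collapses the old four invalidation cases into one 'parts' count: cutting [pos, end) out of an extent leaves zero, one, or two survivors, each kept only if it still meets the minimum extent length. A toy model of that arithmetic; MIN_EXTENT_LEN is an assumed stand-in for F2FS_MIN_EXTENT_LEN.

#include <stdio.h>

#define MIN_EXTENT_LEN 64	/* assumed value for illustration */

static int split(unsigned fofs, unsigned org_end,
		 unsigned pos, unsigned end)
{
	int parts = 0;

	if (pos > fofs && pos - fofs >= MIN_EXTENT_LEN) {
		printf("keep left  [%u, %u)\n", fofs, pos);
		parts++;
	}
	if (end < org_end && org_end - end >= MIN_EXTENT_LEN) {
		printf("keep right [%u, %u)\n", end, org_end);
		parts++;
	}
	return parts;	/* 0: node detached, 2: tree gained a node */
}

int main(void)
{
	/* punch [300, 400) out of extent [0, 1000): two survivors */
	printf("parts = %d\n", split(0, 1000, 300, 400));
	return 0;
}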
@@ -597,45 +551,44 @@ update_extent:
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+ struct extent_tree *et, *next;
struct extent_node *en, *tmp;
unsigned long ino = F2FS_ROOT_INO(sbi);
- struct radix_tree_root *root = &sbi->extent_tree_root;
unsigned int found;
unsigned int node_cnt = 0, tree_cnt = 0;
int remained;
+ bool do_free = false;
if (!test_opt(sbi, EXTENT_CACHE))
return 0;
+ if (!atomic_read(&sbi->total_zombie_tree))
+ goto free_node;
+
if (!down_write_trylock(&sbi->extent_tree_lock))
goto out;
/* 1. remove unreferenced extent tree */
- while ((found = radix_tree_gang_lookup(root,
- (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
- unsigned i;
-
- ino = treevec[found - 1]->ino + 1;
- for (i = 0; i < found; i++) {
- struct extent_tree *et = treevec[i];
-
- if (!atomic_read(&et->refcount)) {
- write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et, true);
- write_unlock(&et->lock);
+ list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ if (atomic_read(&et->node_cnt)) {
+ write_lock(&et->lock);
+ node_cnt += __free_extent_tree(sbi, et, true);
+ write_unlock(&et->lock);
+ }
- radix_tree_delete(root, et->ino);
- kmem_cache_free(extent_tree_slab, et);
- sbi->total_ext_tree--;
- tree_cnt++;
+ list_del_init(&et->list);
+ radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ kmem_cache_free(extent_tree_slab, et);
+ atomic_dec(&sbi->total_ext_tree);
+ atomic_dec(&sbi->total_zombie_tree);
+ tree_cnt++;
- if (node_cnt + tree_cnt >= nr_shrink)
- goto unlock_out;
- }
- }
+ if (node_cnt + tree_cnt >= nr_shrink)
+ goto unlock_out;
}
up_write(&sbi->extent_tree_lock);
+free_node:
/* 2. remove LRU extent entries */
if (!down_write_trylock(&sbi->extent_tree_lock))
goto out;
@@ -647,10 +600,19 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
if (!remained--)
break;
list_del_init(&en->list);
+ do_free = true;
}
spin_unlock(&sbi->extent_lock);
- while ((found = radix_tree_gang_lookup(root,
+ if (!do_free)
+ goto unlock_out;
+
+ /*
+ * reset ino for searching victims from beginning of global extent tree.
+ */
+ ino = F2FS_ROOT_INO(sbi);
+
+ while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
unsigned i;
@@ -658,12 +620,16 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
for (i = 0; i < found; i++) {
struct extent_tree *et = treevec[i];
- write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et, false);
- write_unlock(&et->lock);
+ if (!atomic_read(&et->node_cnt))
+ continue;
+
+ if (write_trylock(&et->lock)) {
+ node_cnt += __free_extent_tree(sbi, et, false);
+ write_unlock(&et->lock);
+ }
if (node_cnt + tree_cnt >= nr_shrink)
- break;
+ goto unlock_out;
}
}
unlock_out:
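
The shrinker above now uses write_trylock() so memory reclaim skips a busy tree instead of sleeping on its lock. A userspace analogy with pthreads (link with -lpthread); shrink_one() is hypothetical, not f2fs code.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static int shrink_one(void)
{
	if (pthread_rwlock_trywrlock(&lock) != 0)
		return 0;	/* busy: skip, don't block reclaim */
	/* ... free nodes under the lock ... */
	pthread_rwlock_unlock(&lock);
	return 1;
}

int main(void)
{
	printf("freed: %d\n", shrink_one());
	return 0;
}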
@@ -680,7 +646,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
struct extent_tree *et = F2FS_I(inode)->extent_tree;
unsigned int node_cnt = 0;
- if (!et)
+ if (!et || !atomic_read(&et->node_cnt))
return 0;
write_lock(&et->lock);
@@ -699,8 +665,12 @@ void f2fs_destroy_extent_tree(struct inode *inode)
if (!et)
return;
- if (inode->i_nlink && !is_bad_inode(inode) && et->count) {
- atomic_dec(&et->refcount);
+ if (inode->i_nlink && !is_bad_inode(inode) &&
+ atomic_read(&et->node_cnt)) {
+ down_write(&sbi->extent_tree_lock);
+ list_add_tail(&et->list, &sbi->zombie_list);
+ atomic_inc(&sbi->total_zombie_tree);
+ up_write(&sbi->extent_tree_lock);
return;
}
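
With the refcount gone, a tree whose inode is still linked is parked on sbi->zombie_list here and reaped later by the shrinker. A toy park/reap sketch, using a plain singly linked list in place of the kernel's intrusive list_head:

#include <stdio.h>
#include <stdlib.h>

struct tree {
	int ino;
	struct tree *next;	/* stand-in for the zombie_list linkage */
};

static struct tree *zombies;

static void park(struct tree *t)	/* destroy path, inode still linked */
{
	t->next = zombies;
	zombies = t;
}

static int reap(void)			/* shrinker path */
{
	int n = 0;

	while (zombies) {
		struct tree *t = zombies;

		zombies = t->next;
		free(t);
		n++;
	}
	return n;
}

int main(void)
{
	struct tree *t = malloc(sizeof(*t));

	t->ino = 3;
	park(t);
	printf("reaped %d zombie tree(s)\n", reap());
	return 0;
}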
@@ -709,11 +679,10 @@ void f2fs_destroy_extent_tree(struct inode *inode)
/* delete extent tree entry in radix tree */
down_write(&sbi->extent_tree_lock);
- atomic_dec(&et->refcount);
- f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
kmem_cache_free(extent_tree_slab, et);
- sbi->total_ext_tree--;
+ atomic_dec(&sbi->total_ext_tree);
up_write(&sbi->extent_tree_lock);
F2FS_I(inode)->extent_tree = NULL;
@@ -765,7 +734,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi)
init_rwsem(&sbi->extent_tree_lock);
INIT_LIST_HEAD(&sbi->extent_list);
spin_lock_init(&sbi->extent_lock);
- sbi->total_ext_tree = 0;
+ atomic_set(&sbi->total_ext_tree, 0);
+ INIT_LIST_HEAD(&sbi->zombie_list);
+ atomic_set(&sbi->total_zombie_tree, 0);
atomic_set(&sbi->total_ext_node, 0);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f1a90ffd7cad..ff79054c6cf6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -19,7 +19,9 @@
#include <linux/magic.h>
#include <linux/kobject.h>
#include <linux/sched.h>
+#include <linux/vmalloc.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
@@ -52,6 +54,8 @@
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
+#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -122,6 +126,8 @@ enum {
(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
+#define DEF_CP_INTERVAL 60 /* 60 secs */
+#define DEF_IDLE_INTERVAL 120 /* 2 mins */
struct cp_control {
int reason;
@@ -155,13 +161,7 @@ struct ino_entry {
nid_t ino; /* inode number */
};
-/*
- * for the list of directory inodes or gc inodes.
- * NOTE: there are two slab users for this structure, if we add/modify/delete
- * fields in structure for one of slab users, it may affect fields or size of
- * other one, in this condition, it's better to split both of slab and related
- * data structure.
- */
+/* for the list of inodes to be GCed */
struct inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
@@ -230,6 +230,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
+#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
#define F2FS_IOC_SET_ENCRYPTION_POLICY \
_IOR('f', 19, struct f2fs_encryption_policy)
@@ -246,15 +248,22 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
+#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
-#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
#endif
+struct f2fs_defragment {
+ u64 start;
+ u64 len;
+};
+
/*
* For INODE and NODE manager
*/
@@ -352,9 +361,9 @@ struct extent_tree {
struct rb_root root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
struct extent_info largest; /* largest extent info */
+ struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
- atomic_t refcount; /* reference count of rb-tree */
- unsigned int count; /* # of extent node in rb-tree*/
+ atomic_t node_cnt; /* # of extent node in rb-tree */
};
/*
@@ -429,8 +438,8 @@ struct f2fs_inode_info {
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
- struct inode_entry *dirty_dir; /* the pointer of dirty dir */
+ struct list_head dirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
@@ -492,12 +501,20 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
+static inline void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
+{
+ if (en->ei.len > et->largest.len)
+ et->largest = en->ei;
+}
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* maximum available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
+ unsigned int ra_nid_pages; /* # of nid pages to be read ahead */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -531,6 +548,7 @@ struct dnode_of_data {
nid_t nid; /* node id of the direct node block */
unsigned int ofs_in_node; /* data offset in the node page */
bool inode_page_locked; /* inode page is locked or not */
+ bool node_changed; /* is node block changed */
block_t data_blkaddr; /* block address of the node block */
};
@@ -634,6 +652,7 @@ struct f2fs_sm_info {
enum count_type {
F2FS_WRITEBACK,
F2FS_DIRTY_DENTS,
+ F2FS_DIRTY_DATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
@@ -682,6 +701,12 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+enum inode_type {
+ DIR_INODE, /* for dirty dir inode */
+ FILE_INODE, /* for dirty regular/symlink inode */
+ NR_INODE_TYPE,
+};
+
/* for inner inode cache management */
struct inode_management {
struct radix_tree_root ino_root; /* ino entry array */
@@ -698,11 +723,17 @@ enum {
SBI_POR_DOING, /* recovery is doing or not */
};
+enum {
+ CP_TIME,
+ REQ_TIME,
+ MAX_TIME,
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
- struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
+ int valid_super_block; /* valid super block no */
int s_flag; /* flags for sbi */
/* for node-related operations */
@@ -724,22 +755,26 @@ struct f2fs_sb_info {
struct rw_semaphore node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
wait_queue_head_t cp_wait;
+ unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
+ long interval_time[MAX_TIME]; /* to store thresholds */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
/* for orphan inode, use 0'th array */
unsigned int max_orphans; /* max orphan inodes */
- /* for directory inode management */
- struct list_head dir_inode_list; /* dir inode list */
- spinlock_t dir_inode_lock; /* for dir inode list lock */
+ /* for inode management */
+ struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */
+ spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */
/* for extent tree cache */
struct radix_tree_root extent_tree_root;/* cache extent cache entries */
struct rw_semaphore extent_tree_lock; /* locking extent radix tree */
struct list_head extent_list; /* lru list for shrinker */
spinlock_t extent_lock; /* locking extent lru list */
- int total_ext_tree; /* extent tree count */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
atomic_t total_ext_node; /* extent info count */
/* basic filesystem units */
@@ -757,6 +792,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
unsigned int total_valid_inode_count; /* valid inode count */
+ loff_t max_file_blocks; /* max block index of file */
int active_logs; /* # of active logs */
int dir_level; /* directory level */
@@ -787,15 +823,15 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
- atomic_t total_hit_ext; /* # of lookup extent cache */
- atomic_t read_hit_rbtree; /* # of hit rbtree extent node */
- atomic_t read_hit_largest; /* # of hit largest extent node */
- atomic_t read_hit_cached; /* # of hit cached extent node */
+ atomic64_t total_hit_ext; /* # of lookup extent cache */
+ atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
+ atomic64_t read_hit_largest; /* # of hit largest extent node */
+ atomic64_t read_hit_cached; /* # of hit cached extent node */
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
int bg_gc; /* background gc calls */
- unsigned int n_dirty_dirs; /* # of dir inodes */
+ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
@@ -810,6 +846,31 @@ struct f2fs_sb_info {
unsigned int shrinker_run_no;
};
+static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
+{
+ sbi->last_time[type] = jiffies;
+}
+
+static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
+{
+ struct timespec ts = {sbi->interval_time[type], 0};
+ unsigned long interval = timespec_to_jiffies(&ts);
+
+ return time_after(jiffies, sbi->last_time[type] + interval);
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct request_list *rl = &q->root_rl;
+
+ if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
+ return false;
+
+ return f2fs_time_over(sbi, REQ_TIME);
+}
+
/*
* Inline functions
*/
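
f2fs_time_over() above is the basis of the new CP_TIME/REQ_TIME idling heuristic: stamp last_time on activity, report "over" once the interval elapses. A userspace analog with time() in place of jiffies; the kernel version uses time_after() so the comparison survives jiffies wraparound. Names here are illustrative.

#include <stdio.h>
#include <time.h>

static time_t last_time;
static long interval_time = 60;		/* DEF_CP_INTERVAL, seconds */

static void update_time(void)
{
	last_time = time(NULL);
}

static int time_over(void)
{
	return time(NULL) > last_time + interval_time;
}

int main(void)
{
	update_time();
	printf("over right after reset? %d\n", time_over());	/* 0 */
	last_time -= 120;	/* pretend two minutes passed */
	printf("over now? %d\n", time_over());			/* 1 */
	return 0;
}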
@@ -1045,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_pages(struct inode *inode)
{
atomic_inc(&F2FS_I(inode)->dirty_pages);
- if (S_ISDIR(inode->i_mode))
- inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1061,9 +1122,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
return;
atomic_dec(&F2FS_I(inode)->dirty_pages);
-
- if (S_ISDIR(inode->i_mode))
- dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1078,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode)
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
- unsigned int pages_per_sec = sbi->segs_per_sec *
- (1 << sbi->log_blocks_per_seg);
+ unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
return ((get_pages(sbi, block_type) + pages_per_sec - 1)
>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
}
@@ -1220,6 +1279,24 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
return sbi->total_valid_inode_count;
}
+static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
+ pgoff_t index, bool for_write)
+{
+ if (!for_write)
+ return grab_cache_page(mapping, index);
+ return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+}
+
+static inline void f2fs_copy_page(struct page *src, struct page *dst)
+{
+ char *src_kaddr = kmap(src);
+ char *dst_kaddr = kmap(dst);
+
+ memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
+ kunmap(dst);
+ kunmap(src);
+}
+
static inline void f2fs_put_page(struct page *page, int unlock)
{
if (!page)
@@ -1384,6 +1461,8 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1570,13 +1649,31 @@ static inline bool is_dot_dotdot(const struct qstr *str)
static inline bool f2fs_may_extent_tree(struct inode *inode)
{
- mode_t mode = inode->i_mode;
-
if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
return false;
- return S_ISREG(mode);
+ return S_ISREG(inode->i_mode);
+}
+
+static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kmalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
+ return ret;
+}
+
+static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kzalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+ return ret;
}
#define get_inode_mode(i) \
@@ -1609,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
int try_to_free_nats(struct f2fs_sb_info *, int);
-void update_inode(struct inode *, struct page *);
-void update_inode_page(struct inode *);
+int update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
void handle_failed_inode(struct inode *);
@@ -1715,12 +1812,13 @@ void destroy_node_manager_caches(void);
*/
void register_inmem_page(struct inode *, struct page *);
int commit_inmem_pages(struct inode *, bool);
-void f2fs_balance_fs(struct f2fs_sb_info *);
+void f2fs_balance_fs(struct f2fs_sb_info *, bool);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
int create_flush_cmd_control(struct f2fs_sb_info *);
void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
+bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
void release_discard_addrs(struct f2fs_sb_info *);
@@ -1739,6 +1837,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1754,13 +1853,14 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
-int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void release_dirty_inode(struct f2fs_sb_info *);
+void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void release_ino_entry(struct f2fs_sb_info *);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1770,9 +1870,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void update_dirty_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
-void remove_dirty_dir_inode(struct inode *);
-void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
+void remove_dirty_inode(struct inode *);
+int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
+int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
void init_ino_entry_info(struct f2fs_sb_info *);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
@@ -1787,11 +1887,12 @@ void set_data_blkaddr(struct dnode_of_data *);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-struct page *get_read_data_page(struct inode *, pgoff_t, int);
+struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
-struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct f2fs_io_info *);
+int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
@@ -1802,7 +1903,7 @@ int f2fs_release_page(struct page *, gfp_t);
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
-int f2fs_gc(struct f2fs_sb_info *);
+int f2fs_gc(struct f2fs_sb_info *, bool);
void build_gc_manager(struct f2fs_sb_info *);
/*
@@ -1820,9 +1921,11 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext;
- int ext_tree, ext_node;
- int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+ unsigned long long hit_largest, hit_cached, hit_rbtree;
+ unsigned long long hit_total, total_ext;
+ int ext_tree, zombie_tree, ext_node;
+ int ndirty_node, ndirty_meta;
+ int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
int bg_gc, inmem_pages, wb_pages;
@@ -1832,7 +1935,7 @@ struct f2fs_stat_info {
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages;
- int prefree_count, call_count, cp_count;
+ int prefree_count, call_count, cp_count, bg_cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
@@ -1844,7 +1947,7 @@ struct f2fs_stat_info {
unsigned int segment_count[2];
unsigned int block_count[2];
unsigned int inplace_count;
- unsigned base_mem, cache_mem, page_mem;
+ unsigned long long base_mem, cache_mem, page_mem;
};
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1853,14 +1956,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
}
#define stat_inc_cp_count(si) ((si)->cp_count++)
+#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
-#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
-#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
-#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext))
-#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree))
-#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest))
-#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached))
+#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
+#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
+#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
+#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
+#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
#define stat_inc_inline_xattr(inode) \
do { \
if (f2fs_has_inline_xattr(inode)) \
@@ -1931,14 +2035,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *);
void f2fs_destroy_stats(struct f2fs_sb_info *);
-void __init f2fs_create_root_stats(void);
+int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si)
+#define stat_inc_bg_cp_count(si)
#define stat_inc_call_count(si)
#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_dir(sbi)
-#define stat_dec_dirty_dir(sbi)
+#define stat_inc_dirty_inode(sbi, type)
+#define stat_dec_dirty_inode(sbi, type)
#define stat_inc_total_hit(sb)
#define stat_inc_rbtree_node_hit(sb)
#define stat_inc_largest_node_hit(sbi)
@@ -1959,7 +2064,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline void __init f2fs_create_root_stats(void) { }
+static inline int __init f2fs_create_root_stats(void) { return 0; }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -1998,6 +2103,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
bool f2fs_empty_inline_dir(struct inode *);
int f2fs_read_inline_dir(struct file *, struct dir_context *,
struct f2fs_str *);
+int f2fs_inline_data_fiemap(struct inode *,
+ struct fiemap_extent_info *, __u64, __u64);
/*
* shrinker.c
@@ -2011,8 +2118,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *);
* extent_cache.c
*/
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
-void f2fs_drop_largest_extent(struct inode *, pgoff_t);
-void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
unsigned int f2fs_destroy_extent_node(struct inode *);
void f2fs_destroy_extent_tree(struct inode *);
bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
@@ -2063,7 +2169,7 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
static inline bool f2fs_may_encrypt(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#else
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8120f8685141..18ddb1e5182a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct dnode_of_data dn;
int err;
- f2fs_balance_fs(sbi);
-
sb_start_pagefault(inode->i_sb);
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+
file_update_time(vma->vm_file);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
@@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
goto mapped;
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+ if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+ i_size_read(inode)) {
unsigned offset;
offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
@@ -86,10 +87,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
mapped:
/* fill the page */
f2fs_wait_on_page_writeback(page, DATA);
+
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
/* if gced page is attached, don't write to cold segment */
clear_cold_data(page);
out:
sb_end_pagefault(inode->i_sb);
+ f2fs_update_time(sbi, REQ_TIME);
return block_page_mkwrite_return(err);
}
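
Several hunks in file.c add (loff_t) casts before shifting a page index: on 32-bit kernels pgoff_t is 32 bits wide, so the unwidened shift wraps for any file offset at or past 4 GiB. A demonstration with fixed-width types; PAGE_SHIFT 12 is the usual 4 KiB page assumption.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint32_t index = (1u << 20) + 5;	/* page ~4 GiB into the file */

	printf("wrong: %llu\n",
	       (unsigned long long)((index + 1) << PAGE_SHIFT));
	printf("right: %llu\n",
	       (unsigned long long)((uint64_t)(index + 1) << PAGE_SHIFT));
	return 0;
}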
@@ -195,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
+ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(fi, FI_NEED_IPU);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
clear_inode_flag(fi, FI_NEED_IPU);
@@ -227,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
go_write:
- /* guarantee free sections for fsync */
- f2fs_balance_fs(sbi);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
@@ -255,8 +259,10 @@ sync_nodes:
sync_node_pages(sbi, ino, &wbc);
/* if cp_error was enabled, we should avoid infinite loop */
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto out;
+ }
if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
@@ -269,12 +275,13 @@ sync_nodes:
goto out;
/* once recovery info is written, don't need to track this */
- remove_dirty_inode(sbi, ino, APPEND_INO);
+ remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, ino, UPDATE_INO);
+ remove_ino_entry(sbi, ino, UPDATE_INO);
clear_inode_flag(fi, FI_UPDATE_WRITE);
ret = f2fs_issue_flush(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
f2fs_trace_ios(NULL, 1);
@@ -343,7 +350,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
- for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
if (err && err != -ENOENT) {
@@ -412,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ int err;
if (f2fs_encrypted_inode(inode)) {
- int err = f2fs_get_encryption_info(inode);
+ err = f2fs_get_encryption_info(inode);
if (err)
return 0;
}
/* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- int err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
@@ -477,11 +483,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
F2FS_I(dn->inode)) + ofs;
f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
- set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
+ f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
dn->ofs_in_node, nr_free);
return nr_free;
@@ -504,14 +510,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
if (cache_only) {
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (page && PageUptodate(page))
goto truncate_out;
f2fs_put_page(page, 1);
return 0;
}
- page = get_lock_data_page(inode, index);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
return 0;
truncate_out:
@@ -598,7 +604,7 @@ int f2fs_truncate(struct inode *inode, bool lock)
trace_f2fs_truncate(inode);
/* we should check inline_data size */
- if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
+ if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
if (err)
return err;
@@ -673,13 +679,21 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
err = f2fs_truncate(inode, true);
if (err)
return err;
- f2fs_balance_fs(F2FS_I_SB(inode));
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
} else {
/*
* do not trim all blocks after i_size if target size is
* larger than i_size.
*/
truncate_setsize(inode, attr->ia_size);
+
+ /* should convert inline inode here */
+ if (!f2fs_may_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
}
@@ -720,7 +734,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
if (!len)
return 0;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
page = get_new_data_page(inode, NULL, index, false);
@@ -738,23 +752,31 @@ static int fill_zero(struct inode *inode, pgoff_t index,
int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
{
- pgoff_t index;
int err;
- for (index = pg_start; index < pg_end; index++) {
+ while (pg_start < pg_end) {
struct dnode_of_data dn;
+ pgoff_t end_offset, count;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
if (err) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ pg_start++;
continue;
+ }
return err;
}
- if (dn.data_blkaddr != NULL_ADDR)
- truncate_data_blocks_range(&dn, 1);
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
+
+ f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
+
+ truncate_data_blocks_range(&dn, count);
f2fs_put_dnode(&dn);
+
+ pg_start += count;
}
return 0;
}
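
truncate_hole() above now frees up to a whole node page of block pointers per lookup instead of one block per lookup. A toy model of the batching; ADDRS_PER_PAGE is an assumed constant here, and the real block-index-to-node-page mapping is more involved than a modulus.

#include <stdio.h>

#define ADDRS_PER_PAGE 1018	/* assumed per-node-page pointer count */

int main(void)
{
	unsigned pg_start = 100, pg_end = 3000, lookups = 0;

	while (pg_start < pg_end) {
		/* toy mapping: offset of pg_start inside its node page */
		unsigned ofs = pg_start % ADDRS_PER_PAGE;
		unsigned count = ADDRS_PER_PAGE - ofs;

		if (count > pg_end - pg_start)
			count = pg_end - pg_start;
		/* truncate_data_blocks_range(&dn, count) goes here */
		pg_start += count;
		lookups++;
	}
	printf("node-page lookups: %u (vs 2900 single-block ones)\n",
	       lookups);
	return 0;
}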
@@ -763,16 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
pgoff_t pg_start, pg_end;
loff_t off_start, off_end;
- int ret = 0;
-
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
+ int ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -803,10 +820,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
- blk_start = pg_start << PAGE_CACHE_SHIFT;
- blk_end = pg_end << PAGE_CACHE_SHIFT;
+ blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
+ blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -819,86 +836,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+static int __exchange_data_block(struct inode *inode, pgoff_t src,
+ pgoff_t dst, bool full)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
- int ret = 0;
-
- for (; end < nrpages; start++, end++) {
- block_t new_addr, old_addr;
-
- f2fs_lock_op(sbi);
+ block_t new_addr;
+ bool do_replace = false;
+ int ret;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- new_addr = NULL_ADDR;
- } else {
- new_addr = dn.data_blkaddr;
- truncate_data_blocks_range(&dn, 1);
- f2fs_put_dnode(&dn);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ return ret;
+ } else if (ret == -ENOENT) {
+ new_addr = NULL_ADDR;
+ } else {
+ new_addr = dn.data_blkaddr;
+ if (!is_checkpointed_data(sbi, new_addr)) {
+ dn.data_blkaddr = NULL_ADDR;
+ /* do not invalidate this block address */
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ do_replace = true;
}
+ f2fs_put_dnode(&dn);
+ }
- if (new_addr == NULL_ADDR) {
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- f2fs_unlock_op(sbi);
- continue;
- }
+ if (new_addr == NULL_ADDR)
+ return full ? truncate_hole(inode, dst, dst + 1) : 0;
- if (dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
- continue;
- } else {
- truncate_data_blocks_range(&dn, 1);
- }
+ if (do_replace) {
+ struct page *ipage = get_node_page(sbi, inode->i_ino);
+ struct node_info ni;
- f2fs_put_dnode(&dn);
- } else {
- struct page *ipage;
+ if (IS_ERR(ipage)) {
+ ret = PTR_ERR(ipage);
+ goto err_out;
+ }
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto out;
- }
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ ret = f2fs_reserve_block(&dn, dst);
+ if (ret)
+ goto err_out;
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, start);
- if (ret)
- goto out;
+ truncate_data_blocks_range(&dn, 1);
- old_addr = dn.data_blkaddr;
- if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) {
- dn.data_blkaddr = NULL_ADDR;
- f2fs_update_extent_cache(&dn);
- invalidate_blocks(sbi, old_addr);
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
+ ni.version, true);
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *psrc, *pdst;
+
+ psrc = get_lock_data_page(inode, src, true);
+ if (IS_ERR(psrc))
+ return PTR_ERR(psrc);
+ pdst = get_new_data_page(inode, NULL, dst, false);
+ if (IS_ERR(pdst)) {
+ f2fs_put_page(psrc, 1);
+ return PTR_ERR(pdst);
+ }
+ f2fs_copy_page(psrc, pdst);
+ set_page_dirty(pdst);
+ f2fs_put_page(pdst, 1);
+ f2fs_put_page(psrc, 1);
- dn.data_blkaddr = new_addr;
- set_data_blkaddr(&dn);
- } else if (new_addr != NEW_ADDR) {
- struct node_info ni;
+ return truncate_hole(inode, src, src + 1);
+ }
+ return 0;
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, old_addr, new_addr,
- ni.version, true);
- }
+err_out:
+ if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
+ dn.data_blkaddr = new_addr;
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ f2fs_put_dnode(&dn);
+ }
+ return ret;
+}
- f2fs_put_dnode(&dn);
- }
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int ret = 0;
+
+ for (; end < nrpages; start++, end++) {
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+ ret = __exchange_data_block(inode, end, start, true);
f2fs_unlock_op(sbi);
+ if (ret)
+ break;
}
- return 0;
-out:
- f2fs_unlock_op(sbi);
return ret;
}
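
f2fs_do_collapse() is now a thin loop over __exchange_data_block(), moving every page from 'end' onward one region toward 'start' and rebalancing per iteration. The index walk, shown in isolation:

#include <stdio.h>

int main(void)
{
	unsigned start = 2, end = 5, nrpages = 9, src, dst;

	for (src = end, dst = start; src < nrpages; src++, dst++)
		printf("page %u -> slot %u\n", src, dst);
	printf("file shrinks to %u pages\n", nrpages - (end - start));
	return 0;
}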
@@ -908,9 +939,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
loff_t new_size;
int ret;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
if (offset + len >= i_size_read(inode))
return -EINVAL;
@@ -918,13 +946,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
return -EINVAL;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
pg_start = offset >> PAGE_CACHE_SHIFT;
pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
@@ -940,7 +964,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
new_size = i_size_read(inode) - len;
+ truncate_pagecache(inode, new_size);
ret = truncate_blocks(inode, new_size, true);
if (!ret)
@@ -959,20 +988,13 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
loff_t off_start, off_end;
int ret = 0;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
- f2fs_balance_fs(sbi);
-
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
if (ret)
@@ -1003,7 +1025,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
return ret;
new_size = max_t(loff_t, new_size,
- pg_start << PAGE_CACHE_SHIFT);
+ (loff_t)pg_start << PAGE_CACHE_SHIFT);
}
for (index = pg_start; index < pg_end; index++) {
@@ -1039,7 +1061,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
new_size = max_t(loff_t, new_size,
- (index + 1) << PAGE_CACHE_SHIFT);
+ (loff_t)(index + 1) << PAGE_CACHE_SHIFT);
}
if (off_end) {
@@ -1066,10 +1088,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t pg_start, pg_end, delta, nrpages, idx;
loff_t new_size;
- int ret;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
+ int ret = 0;
new_size = i_size_read(inode) + len;
if (new_size > inode->i_sb->s_maxbytes)
@@ -1082,13 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
return -EINVAL;
- f2fs_balance_fs(sbi);
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ f2fs_balance_fs(sbi, true);
ret = truncate_blocks(inode, i_size_read(inode), true);
if (ret)
@@ -1107,57 +1124,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
- struct dnode_of_data dn;
- struct page *ipage;
- block_t new_addr, old_addr;
-
f2fs_lock_op(sbi);
-
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- goto next;
- } else if (dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- goto next;
- } else {
- new_addr = dn.data_blkaddr;
- truncate_data_blocks_range(&dn, 1);
- f2fs_put_dnode(&dn);
- }
-
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto out;
- }
-
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, idx + delta);
- if (ret)
- goto out;
-
- old_addr = dn.data_blkaddr;
- f2fs_bug_on(sbi, old_addr != NEW_ADDR);
-
- if (new_addr != NEW_ADDR) {
- struct node_info ni;
-
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, old_addr, new_addr,
- ni.version, true);
- }
- f2fs_put_dnode(&dn);
-next:
+ ret = __exchange_data_block(inode, idx, idx + delta, false);
f2fs_unlock_op(sbi);
+ if (ret)
+ break;
}
- i_size_write(inode, new_size);
- return 0;
-out:
- f2fs_unlock_op(sbi);
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
+ if (!ret)
+ i_size_write(inode, new_size);
return ret;
}
@@ -1170,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t off_start, off_end;
int ret = 0;
- f2fs_balance_fs(sbi);
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(sbi, true);
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -1204,9 +1181,10 @@ noalloc:
if (pg_start == pg_end)
new_size = offset + len;
else if (index == pg_start && off_start)
- new_size = (index + 1) << PAGE_CACHE_SHIFT;
+ new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
else if (index == pg_end)
- new_size = (index << PAGE_CACHE_SHIFT) + off_end;
+ new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+ off_end;
else
new_size += PAGE_CACHE_SIZE;
}
@@ -1228,6 +1206,10 @@ static long f2fs_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
long ret = 0;
+ /* f2fs only supports ->fallocate for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
if (f2fs_encrypted_inode(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1257,6 +1239,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
out:
@@ -1364,8 +1347,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
if (f2fs_is_atomic_file(inode))
return 0;
@@ -1374,6 +1355,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
return ret;
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
return 0;
}
@@ -1395,8 +1378,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
ret = commit_inmem_pages(inode, false);
- if (ret)
+ if (ret) {
+ set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
goto err_out;
+ }
}
ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
@@ -1421,6 +1406,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
return ret;
set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return 0;
}
@@ -1437,8 +1423,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
if (!f2fs_is_first_block_written(inode))
return truncate_partial_data_page(inode, 0, true);
- punch_hole(inode, 0, F2FS_BLKSIZE);
- return 0;
+ return punch_hole(inode, 0, F2FS_BLKSIZE);
}
static int f2fs_ioc_abort_volatile_write(struct file *filp)
@@ -1453,17 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (ret)
return ret;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
commit_inmem_pages(inode, true);
}
-
- if (f2fs_is_volatile_file(inode))
+ if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ }
mnt_drop_write_file(filp);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
}
@@ -1496,9 +1481,14 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
case F2FS_GOING_DOWN_NOSYNC:
f2fs_stop_checkpoint(sbi);
break;
+ case F2FS_GOING_DOWN_METAFLUSH:
+ sync_meta_pages(sbi, META, LONG_MAX);
+ f2fs_stop_checkpoint(sbi);
+ break;
default:
return -EINVAL;
}
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
}
@@ -1529,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return 0;
}
@@ -1552,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
sizeof(policy)))
return -EFAULT;
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return f2fs_process_policy(&policy, inode);
#else
return -EOPNOTSUPP;
@@ -1598,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
err = f2fs_commit_super(sbi, false);
-
- mnt_drop_write_file(filp);
if (err) {
/* undo new data */
memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+ mnt_drop_write_file(filp);
return err;
}
+ mnt_drop_write_file(filp);
got_it:
if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
16))
@@ -1616,29 +1608,228 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- __u32 i, count;
+ __u32 sync;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (get_user(count, (__u32 __user *)arg))
+ if (get_user(sync, (__u32 __user *)arg))
return -EFAULT;
- if (!count || count > F2FS_BATCH_GC_MAX_NUM)
- return -EINVAL;
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
- for (i = 0; i < count; i++) {
+ if (!sync) {
if (!mutex_trylock(&sbi->gc_mutex))
- break;
+ return -EBUSY;
+ } else {
+ mutex_lock(&sbi->gc_mutex);
+ }
+
+ return f2fs_gc(sbi, sync);
+}
+
+static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ return f2fs_sync_fs(sbi->sb, 1);
+}
+
+static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
+ struct file *filp,
+ struct f2fs_defragment *range)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_map_blocks map;
+ struct extent_info ei;
+ pgoff_t pg_start, pg_end;
+ unsigned int blk_per_seg = sbi->blocks_per_seg;
+ unsigned int total = 0, sec_num;
+ unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
+ block_t blk_end = 0;
+ bool fragmented = false;
+ int err;
+
+ /* if in-place-update policy is enabled, don't waste time here */
+ if (need_inplace_update(inode))
+ return -EINVAL;
+
+ pg_start = range->start >> PAGE_CACHE_SHIFT;
+ pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
+
+ f2fs_balance_fs(sbi, true);
+
+ mutex_lock(&inode->i_mutex);
+
+ /* write back all dirty pages in the range */
+ err = filemap_write_and_wait_range(inode->i_mapping, range->start,
+ range->start + range->len - 1);
+ if (err)
+ goto out;
+
+ /*
+ * look up mapping info in the extent cache; skip defragmenting if the
+ * physical block addresses are contiguous.
+ */
+ if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (ei.fofs + ei.len >= pg_end)
+ goto out;
+ }
+
+ map.m_lblk = pg_start;
+
+ /*
+ * look up mapping info in the dnode page cache; skip defragmenting if
+ * all physical block addresses are contiguous, even if there are
+ * holes in the logical blocks.
+ */
+ while (map.m_lblk < pg_end) {
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
- if (f2fs_gc(sbi))
+ if (blk_end && blk_end != map.m_pblk) {
+ fragmented = true;
break;
+ }
+ blk_end = map.m_pblk + map.m_len;
+
+ map.m_lblk += map.m_len;
}
- if (put_user(i, (__u32 __user *)arg))
- return -EFAULT;
+ if (!fragmented)
+ goto out;
- return 0;
+ map.m_lblk = pg_start;
+ map.m_len = pg_end - pg_start;
+
+ sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+
+ /*
+ * make sure there are enough free sections for LFS allocation; this
+ * avoids running defragmentation in SSR mode when free sections are
+ * being allocated intensively
+ */
+ if (has_not_enough_free_secs(sbi, sec_num)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ while (map.m_lblk < pg_end) {
+ pgoff_t idx;
+ int cnt = 0;
+
+do_map:
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto clear_out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+ idx = map.m_lblk;
+ while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+ struct page *page;
+
+ page = get_lock_data_page(inode, idx, true);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto clear_out;
+ }
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ idx++;
+ cnt++;
+ total++;
+ }
+
+ map.m_lblk = idx;
+
+ if (idx < pg_end && cnt < blk_per_seg)
+ goto do_map;
+
+ clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+ err = filemap_fdatawrite(inode->i_mapping);
+ if (err)
+ goto out;
+ }
+clear_out:
+ clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (!err)
+ range->len = (u64)total << PAGE_CACHE_SHIFT;
+ return err;
+}
+
+static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_defragment range;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
+ goto out;
+ }
+
+ if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
+ sizeof(range))) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* verify alignment of offset & size */
+ if (range.start & (F2FS_BLKSIZE - 1) ||
+ range.len & (F2FS_BLKSIZE - 1)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = f2fs_defragment_range(sbi, filp, &range);
+ f2fs_update_time(sbi, REQ_TIME);
+ if (err < 0)
+ goto out;
+
+ if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
+ sizeof(range)))
+ err = -EFAULT;
+out:
+ mnt_drop_write_file(filp);
+ return err;
}
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -1672,6 +1863,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_get_encryption_pwsalt(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT:
return f2fs_ioc_gc(filp, arg);
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ return f2fs_ioc_write_checkpoint(filp, arg);
+ case F2FS_IOC_DEFRAGMENT:
+ return f2fs_ioc_defragment(filp, arg);
default:
return -ENOTTY;
}
@@ -1699,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC32_SETFLAGS:
cmd = F2FS_IOC_SETFLAGS;
break;
+ case F2FS_IOC32_GETVERSION:
+ cmd = F2FS_IOC_GETVERSION;
+ break;
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+ case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ case F2FS_IOC_SHUTDOWN:
+ case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GARBAGE_COLLECT:
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ case F2FS_IOC_DEFRAGMENT:
+ break;
default:
return -ENOIOCTLCMD;
}
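
The defragment path above is driven from userspace through the new F2FS_IOC_DEFRAGMENT ioctl. A minimal caller sketch follows; the two-field start/len layout of struct f2fs_defragment and the 0xf5 ioctl magic are assumed from this series' fs/f2fs/f2fs.h and should be verified against your headers. On success the kernel rewrites range.len to the number of bytes it actually redirtied for relocation.

/* sketch: invoke F2FS_IOC_DEFRAGMENT on a file (definitions assumed
 * from fs/f2fs/f2fs.h of this series -- verify before use) */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

struct f2fs_defragment {
	uint64_t start;			/* byte offset, block-aligned */
	uint64_t len;			/* byte length, block-aligned */
};

#define F2FS_IOCTL_MAGIC	0xf5
#define F2FS_IOC_DEFRAGMENT	_IOWR(F2FS_IOCTL_MAGIC, 8, struct f2fs_defragment)

int main(int argc, char **argv)
{
	struct f2fs_defragment range = { .start = 0, .len = 64ULL << 20 };
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDWR)) < 0)
		return 1;

	if (ioctl(fd, F2FS_IOC_DEFRAGMENT, &range) < 0)
		perror("F2FS_IOC_DEFRAGMENT");
	else	/* kernel writes back how many bytes it redirtied */
		printf("defragmented %llu bytes\n",
		       (unsigned long long)range.len);

	close(fd);
	return 0;
}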
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 782b8e72c094..f610c2a9bdde 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
-#include <linux/blkdev.h>
#include "f2fs.h"
#include "node.h"
@@ -78,9 +77,12 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
wait_ms = gc_th->no_gc_sleep_time;
+ trace_f2fs_background_gc(sbi->sb, wait_ms,
+ prefree_segments(sbi), free_segments(sbi));
+
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
@@ -170,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
- return 1 << sbi->log_blocks_per_seg;
+ return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+ return sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -257,6 +259,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
unsigned int secno, max_cost;
+ unsigned int last_segment = MAIN_SEGS(sbi);
int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
@@ -267,6 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
+ if (p.max_search == 0)
+ goto out;
+
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -277,9 +283,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
- if (segno >= MAIN_SEGS(sbi)) {
+ segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
+ if (segno >= last_segment) {
if (sbi->last_victim[p.gc_mode]) {
+ last_segment = sbi->last_victim[p.gc_mode];
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
@@ -327,6 +334,7 @@ got_it:
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
}
+out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -541,7 +549,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
int err;
/* do not read out */
- page = grab_cache_page(inode->i_mapping, bidx);
+ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
return;
@@ -550,8 +558,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
if (err)
goto out;
- if (unlikely(dn.data_blkaddr == NULL_ADDR))
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+ ClearPageUptodate(page);
goto put_out;
+ }
+
+ /*
+ * don't cache encrypted data in the meta inode until the previous
+ * dirty data has been written back, to avoid racing between GC and
+ * flush.
+ */
+ f2fs_wait_on_page_writeback(page, DATA);
get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
@@ -580,7 +596,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
goto put_page_out;
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
@@ -611,7 +627,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
{
struct page *page;
- page = get_lock_data_page(inode, bidx);
+ page = get_lock_data_page(inode, bidx, true);
if (IS_ERR(page))
return;
@@ -705,7 +721,7 @@ next_step:
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
data_page = get_read_data_page(inode,
- start_bidx + ofs_in_node, READA);
+ start_bidx + ofs_in_node, READA, true);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -797,13 +813,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
return nfree;
}
-int f2fs_gc(struct f2fs_sb_info *sbi)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
- unsigned int segno = NULL_SEGNO;
- unsigned int i;
- int gc_type = BG_GC;
- int nfree = 0;
- int ret = -1;
+ unsigned int segno, i;
+ int gc_type = sync ? FG_GC : BG_GC;
+ int sec_freed = 0;
+ int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
@@ -812,12 +827,16 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
cpc.reason = __get_cp_reason(sbi);
gc_more:
+ segno = NULL_SEGNO;
+
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto stop;
+ }
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
gc_type = FG_GC;
if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
write_checkpoint(sbi, &cpc);
@@ -830,23 +849,38 @@ gc_more:
/* readahead multi ssa blocks those have contiguous address */
if (sbi->segs_per_sec > 1)
ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
- META_SSA);
+ META_SSA, true);
- for (i = 0; i < sbi->segs_per_sec; i++)
- nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
+ for (i = 0; i < sbi->segs_per_sec; i++) {
+ /*
+ * in the FG_GC case, stop GCing the remaining segments once one
+ * segment in the selected section fails, to avoid long latency.
+ */
+ if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
+ gc_type == FG_GC)
+ break;
+ }
+
+ if (i == sbi->segs_per_sec && gc_type == FG_GC)
+ sec_freed++;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
- if (has_not_enough_free_secs(sbi, nfree))
- goto gc_more;
+ if (!sync) {
+ if (has_not_enough_free_secs(sbi, sec_freed))
+ goto gc_more;
- if (gc_type == FG_GC)
- write_checkpoint(sbi, &cpc);
+ if (gc_type == FG_GC)
+ write_checkpoint(sbi, &cpc);
+ }
stop:
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
+
+ if (sync)
+ ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
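
With the batch counter gone, f2fs_gc(sbi, sync) has simple semantics: sync == false takes gc_mutex opportunistically and runs BG_GC (escalating to FG_GC only under free-section pressure), while sync == true blocks on the mutex, forces one FG_GC pass, and returns -EAGAIN once no section could be freed. A hedged userspace sketch of draining reclaimable sections through the ioctl; the _IOW encoding here is an assumption (older trees declared it _IO(0xf5, 6)):

/* sketch: loop synchronous GC until the filesystem reports -EAGAIN */
#include <stdint.h>
#include <errno.h>
#include <sys/ioctl.h>

#define F2FS_IOC_GARBAGE_COLLECT _IOW(0xf5, 6, uint32_t)	/* assumed */

static int gc_until_empty(int fd)
{
	uint32_t sync = 1;	/* force FG_GC, block on gc_mutex */

	for (;;) {
		if (ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, &sync) == 0)
			continue;		 /* freed a section; go again */
		return errno == EAGAIN ? 0 : -1; /* done, or a real error */
	}
}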
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index c5a055b3376e..a993967dcdb9 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -19,12 +19,6 @@
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
-/*
- * with this macro, we can control the max time we do garbage collection,
- * when user triggers batch mode gc by ioctl.
- */
-#define F2FS_BATCH_GC_MAX_NUM 16
-
/* Search max. number of dirty segments to select a victim segment */
#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
@@ -106,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
return true;
return false;
}
-
-static inline int is_idle(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
- return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
-}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3d143be42895..c3f0b7d4cfca 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -12,12 +12,10 @@
#include <linux/f2fs_fs.h>
#include "f2fs.h"
+#include "node.h"
bool f2fs_may_inline_data(struct inode *inode)
{
- if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
- return false;
-
if (f2fs_is_atomic_file(inode))
return false;
@@ -176,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode)
struct page *ipage, *page;
int err = 0;
+ if (!f2fs_has_inline_data(inode))
+ return 0;
+
page = grab_cache_page(inode->i_mapping, 0);
if (!page)
return -ENOMEM;
@@ -198,6 +199,9 @@ out:
f2fs_unlock_op(sbi);
f2fs_put_page(page, 1);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
return err;
}
@@ -274,12 +278,14 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- truncate_inline_inode(ipage, 0);
+ if (!truncate_inline_inode(ipage, 0))
+ return false;
f2fs_clear_inline_inode(inode);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- truncate_blocks(inode, 0, false);
+ if (truncate_blocks(inode, 0, false))
+ return false;
goto process_inline;
}
return false;
@@ -568,3 +574,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
f2fs_put_page(ipage, 1);
return 0;
}
+
+int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+{
+ __u64 byteaddr, ilen;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
+ struct node_info ni;
+ struct page *ipage;
+ int err = 0;
+
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ if (!f2fs_has_inline_data(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+ if (start >= ilen)
+ goto out;
+ if (start + len < ilen)
+ ilen = start + len;
+ ilen -= start;
+
+ get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+ byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+ err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+out:
+ f2fs_put_page(ipage, 1);
+ return err;
+}
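
f2fs_inline_data_fiemap() lets ordinary FIEMAP users see inline data as a single unaligned, inline, last extent inside the inode block. A small sketch using only the generic fiemap ABI (nothing f2fs-specific is assumed):

/* sketch: print the inline-data extent of a small f2fs file */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd;

	fm = calloc(1, sizeof(*fm) + sizeof(struct fiemap_extent));
	if (!fm || argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 1;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	if (fm->fm_mapped_extents &&
	    (fm->fm_extents[0].fe_flags & FIEMAP_EXTENT_DATA_INLINE))
		printf("inline extent: %llu bytes at physical %llu\n",
		       (unsigned long long)fm->fm_extents[0].fe_length,
		       (unsigned long long)fm->fm_extents[0].fe_physical);

	free(fm);
	close(fd);
	return 0;
}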
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 35aae65b3e5d..2adeff26be11 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_tree(inode, &ri->i_ext);
+ if (f2fs_init_extent_tree(inode, &ri->i_ext))
+ set_page_dirty(node_page);
get_inline_info(fi, ri);
@@ -202,6 +203,7 @@ make_now:
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -221,7 +223,7 @@ bad_inode:
return ERR_PTR(ret);
}
-void update_inode(struct inode *inode, struct page *node_page)
+int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
@@ -259,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page)
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
- set_page_dirty(node_page);
-
clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+
+ return set_page_dirty(node_page);
}
-void update_inode_page(struct inode *inode)
+int update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
+ int ret = 0;
retry:
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
@@ -278,10 +281,11 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi);
}
- return;
+ return 0;
}
- update_inode(inode, node_page);
+ ret = update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
+ return ret;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -296,16 +300,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
/*
- * We need to lock here to prevent from producing dirty node pages
+ * We need to balance fs here to prevent producing dirty node pages
* during the urgent cleaning time when running out of free sections.
*/
- f2fs_lock_op(sbi);
- update_inode_page(inode);
- f2fs_unlock_op(sbi);
-
- if (wbc)
- f2fs_balance_fs(sbi);
-
+ if (update_inode_page(inode))
+ f2fs_balance_fs(sbi, true);
return 0;
}
@@ -331,7 +330,7 @@ void f2fs_evict_inode(struct inode *inode)
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
f2fs_destroy_extent_tree(inode);
@@ -361,9 +360,9 @@ no_delete:
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
if (is_inode_flag_set(fi, FI_APPEND_WRITE))
- add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
- add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
if (is_inode_flag_set(fi, FI_FREE_NID)) {
if (err && err != -ENOENT)
alloc_nid_done(sbi, inode->i_ino);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a680bf38e4f0..6f944e5eb76e 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
- if (f2fs_may_inline_data(inode))
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
@@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
nid_t ino = 0;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -172,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
!f2fs_is_child_context_consistent_with_parent(dir, inode))
return -EPERM;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
@@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
struct page *page;
int err = 0;
+ if (f2fs_readonly(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "skip recovering inline_dots inode (ino:%lu, pino:%u) "
+ "in readonly mountpoint", dir->i_ino, pino);
+ return 0;
+ }
+
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
de = f2fs_find_entry(dir, &dot, &page);
@@ -288,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
int err = -ENOENT;
trace_f2fs_unlink_enter(dir, dentry);
- f2fs_balance_fs(sbi);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de)
goto fail;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err) {
@@ -315,12 +325,15 @@ fail:
return err;
}
-static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- const char *link = page_follow_link_light(dentry, cookie);
+ const char *link = page_get_link(dentry, inode, done);
if (!IS_ERR(link) && !*link) {
/* this is broken symlink case */
- page_put_link(NULL, *cookie);
+ do_delayed_call(done);
+ clear_delayed_call(done);
link = ERR_PTR(-ENOENT);
}
return link;
@@ -341,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
if (len > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -351,8 +362,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -410,11 +424,14 @@ err_out:
* If the symlink path is stored into inline_data, there is no
* performance regression.
*/
- if (!err)
+ if (!err) {
filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
- if (IS_DIRSYNC(dir))
- f2fs_sync_fs(sbi->sb, 1);
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+ } else {
+ f2fs_unlink(dir, dentry);
+ }
kfree(sd);
f2fs_fname_crypto_free_buffer(&disk_link);
@@ -430,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -441,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->a_ops = &f2fs_dblock_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+ f2fs_balance_fs(sbi, true);
+
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
@@ -478,11 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -490,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -516,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err;
- if (!whiteout)
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -532,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err)
@@ -604,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
}
- f2fs_balance_fs(sbi);
-
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry)
goto out;
@@ -635,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry)
goto out_whiteout;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
@@ -666,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
update_inode_page(old_inode);
update_inode_page(new_inode);
} else {
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(new_dentry, old_inode);
@@ -763,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode)))
return -EPERM;
- f2fs_balance_fs(sbi);
-
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry)
goto out;
@@ -807,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_new_dir;
}
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
@@ -923,18 +938,22 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
}
#ifdef CONFIG_F2FS_FS_ENCRYPTION
-static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
- struct f2fs_str cstr;
+ struct f2fs_str cstr = FSTR_INIT(NULL, 0);
struct f2fs_str pstr = FSTR_INIT(NULL, 0);
- struct inode *inode = d_inode(dentry);
struct f2fs_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
u32 max_size = inode->i_sb->s_blocksize;
int res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = f2fs_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -942,16 +961,27 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
/* Symlink is encrypted */
sd = (struct f2fs_encrypted_symlink_data *)caddr;
- cstr.name = sd->encrypted_path;
cstr.len = le16_to_cpu(sd->len);
/* this is broken symlink case */
- if (cstr.name[0] == 0 && cstr.len == 0) {
+ if (unlikely(cstr.len == 0)) {
+ res = -ENOENT;
+ goto errout;
+ }
+ cstr.name = kmalloc(cstr.len, GFP_NOFS);
+ if (!cstr.name) {
+ res = -ENOMEM;
+ goto errout;
+ }
+ memcpy(cstr.name, sd->encrypted_path, cstr.len);
+
+ /* this is broken symlink case */
+ if (unlikely(cstr.name[0] == 0)) {
res = -ENOENT;
goto errout;
}
@@ -970,31 +1000,34 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
if (res < 0)
goto errout;
+ kfree(cstr.name);
+
paddr = pstr.name;
/* Null-terminate the name */
paddr[res] = '\0';
- kunmap(cpage);
page_cache_release(cpage);
- return *cookie = paddr;
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
+ kfree(cstr.name);
f2fs_fname_crypto_free_buffer(&pstr);
- kunmap(cpage);
page_cache_release(cpage);
return ERR_PTR(res);
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
.removexattr = generic_removexattr,
+#endif
};
#endif
@@ -1023,8 +1056,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_follow_link,
- .put_link = page_put_link,
+ .get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
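
The symlink conversion above follows the generic ->get_link contract: return the target string and, if it was allocated, hand its destructor to the VFS through set_delayed_call() so the buffer outlives the path walk. A minimal sketch of that pattern under assumed names (demo_get_link is hypothetical; kfree_link is the stock helper from fs/libfs.c):

/* sketch: the allocate-and-defer-free shape of a ->get_link method */
static const char *demo_get_link(struct dentry *dentry, struct inode *inode,
				 struct delayed_call *done)
{
	char *target;

	if (!dentry)			/* RCU lookup: must not sleep */
		return ERR_PTR(-ECHILD);

	target = kstrdup("resolved/target", GFP_KERNEL);
	if (!target)
		return ERR_PTR(-ENOMEM);

	/* the VFS calls kfree_link(target) when the walk is finished */
	set_delayed_call(done, kfree_link, target);
	return target;
}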
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27d1a74dd6f3..342597a5897f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
- mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+ mem_size = (atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree) +
atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
- if (sbi->sb->s_bdi->wb.dirty_exceeded)
- return false;
+ if (!sbi->sb->s_bdi->wb.dirty_exceeded)
+ return true;
}
return res;
}
@@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
{
struct nat_entry *e;
- down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
e = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&e->ni, ne);
}
- up_write(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -379,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ down_write(&nm_i->nat_tree_lock);
+
/* Check current segment summary */
mutex_lock(&curseg->curseg_mutex);
i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -399,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
cache:
/* cache nat entry */
cache_nat_entry(NM_I(sbi), nid, &ne);
+ up_write(&nm_i->nat_tree_lock);
}
/*
@@ -676,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
ret = truncate_dnode(&rdn);
if (ret < 0)
goto out_err;
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
}
} else {
child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
@@ -689,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
rdn.nid = child_nid;
ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
if (ret == (NIDS_PER_BLOCK + 1)) {
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
child_nofs += ret;
} else if (ret < 0 && ret != -ENOENT) {
goto out_err;
@@ -750,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
err = truncate_dnode(dn);
if (err < 0)
goto fail;
- set_nid(pages[idx], i, 0, false);
+ if (set_nid(pages[idx], i, 0, false))
+ dn->node_changed = true;
}
if (offset[idx + 1] == 0) {
@@ -975,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
SetPageUptodate(page);
- set_page_dirty(page);
+ if (set_page_dirty(page))
+ dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
@@ -1035,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
struct page *apage;
int err;
+ if (!nid)
+ return;
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+
apage = find_get_page(NODE_MAPPING(sbi), nid);
if (apage && PageUptodate(apage)) {
f2fs_put_page(apage, 0);
@@ -1050,51 +1060,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_put_page(apage, err ? 1 : 0);
}
-struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+/*
+ * readahead up to MAX_RA_NODE node pages.
+ */
+void ra_node_pages(struct page *parent, int start)
{
- struct page *page;
- int err;
-repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct blk_plug plug;
+ int i, end;
+ nid_t nid;
- err = read_node_page(page, READ_SYNC);
- if (err < 0) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
- } else if (err != LOCKED_PAGE) {
- lock_page(page);
- }
+ blk_start_plug(&plug);
- if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
- ClearPageUptodate(page);
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto repeat;
+ /* try readahead for siblings of the desired node */
+ end = start + MAX_RA_NODE;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ ra_node_page(sbi, nid);
}
- return page;
+
+ blk_finish_plug(&plug);
}
-/*
- * Return a locked page for the desired node page.
- * And, readahead MAX_RA_NODE number of node pages.
- */
-struct page *get_node_page_ra(struct page *parent, int start)
+struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+ struct page *parent, int start)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
- struct blk_plug plug;
struct page *page;
- int err, i, end;
- nid_t nid;
+ int err;
- /* First, try getting the desired direct node. */
- nid = get_nid(parent, start, false);
if (!nid)
return ERR_PTR(-ENOENT);
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
repeat:
page = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!page)
@@ -1108,46 +1105,53 @@ repeat:
goto page_hit;
}
- blk_start_plug(&plug);
-
- /* Then, try readahead for siblings of the desired node */
- end = start + MAX_RA_NODE;
- end = min(end, NIDS_PER_BLOCK);
- for (i = start + 1; i < end; i++) {
- nid = get_nid(parent, i, false);
- if (!nid)
- continue;
- ra_node_page(sbi, nid);
- }
-
- blk_finish_plug(&plug);
+ if (parent)
+ ra_node_pages(parent, start + 1);
lock_page(page);
+
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
page_hit:
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
+ f2fs_bug_on(sbi, nid != nid_of_node(page));
return page;
}
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ return __get_node_page(sbi, nid, NULL, 0);
+}
+
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ nid_t nid = get_nid(parent, start, false);
+
+ return __get_node_page(sbi, nid, parent, start);
+}
+
void sync_inode_page(struct dnode_of_data *dn)
{
+ int ret = 0;
+
if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
- update_inode(dn->inode, dn->node_page);
+ ret = update_inode(dn->inode, dn->node_page);
} else if (dn->inode_page) {
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
- update_inode(dn->inode, dn->inode_page);
+ ret = update_inode(dn->inode, dn->inode_page);
if (!dn->inode_page_locked)
unlock_page(dn->inode_page);
} else {
- update_inode_page(dn->inode);
+ ret = update_inode_page(dn->inode);
}
+ dn->node_changed = ret ? true : false;
}
int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
@@ -1175,6 +1179,11 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (unlikely(f2fs_cp_error(sbi))) {
+ pagevec_release(&pvec);
+ return -EIO;
+ }
+
/*
* flushing sequence with step:
* 0. indirect nodes
@@ -1323,23 +1332,24 @@ static int f2fs_write_node_page(struct page *page,
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
+ if (wbc->for_reclaim) {
+ if (!down_read_trylock(&sbi->node_write))
+ goto redirty_out;
+ } else {
+ down_read(&sbi->node_write);
+ }
+
get_node_info(sbi, nid, &ni);
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
- if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
- goto redirty_out;
- } else {
- down_read(&sbi->node_write);
- }
-
set_page_writeback(page);
fio.blk_addr = ni.blk_addr;
write_node_page(nid, &fio);
@@ -1348,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page,
up_read(&sbi->node_write);
unlock_page(page);
- if (wbc->for_reclaim)
+ if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, NODE, WRITE);
return 0;
@@ -1439,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
if (build) {
/* do not add allocated nids */
- down_read(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
- if (ne &&
- (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
- up_read(&nm_i->nat_tree_lock);
if (allocated)
return 0;
}
@@ -1528,7 +1535,10 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
return;
/* readahead nat pages to be scanned */
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
+ META_NAT, true);
+
+ down_read(&nm_i->nat_tree_lock);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1558,6 +1568,10 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
remove_free_nid(nm_i, nid);
}
mutex_unlock(&curseg->curseg_mutex);
+ up_read(&nm_i->nat_tree_lock);
+
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
+ nm_i->ra_nid_pages, META_NAT, false);
}
/*
@@ -1577,8 +1591,6 @@ retry:
/* We should not use stale free nids created by build_free_nids */
if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- struct node_info ni;
-
f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
list_for_each_entry(i, &nm_i->free_nid_list, list)
if (i->state == NID_NEW)
@@ -1589,13 +1601,6 @@ retry:
i->state = NID_ALLOC;
nm_i->fcnt--;
spin_unlock(&nm_i->free_nid_list_lock);
-
- /* check nid is allocated already */
- get_node_info(sbi, *nid, &ni);
- if (ni.blk_addr != NULL_ADDR) {
- alloc_nid_done(sbi, *nid);
- goto retry;
- }
return true;
}
spin_unlock(&nm_i->free_nid_list_lock);
@@ -1803,10 +1808,10 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
nrpages = min(last_offset - i, bio_blocks);
/* readahead node pages */
- ra_meta_pages(sbi, addr, nrpages, META_POR);
+ ra_meta_pages(sbi, addr, nrpages, META_POR, true);
for (idx = addr; idx < addr + nrpages; idx++) {
- struct page *page = get_meta_page(sbi, idx);
+ struct page *page = get_tmp_page(sbi, idx);
rn = F2FS_NODE(page);
sum_entry->nid = rn->footer.nid;
@@ -1837,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
raw_ne = nat_in_journal(sum, i);
- down_write(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
ne = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&ne->ni, &raw_ne);
}
__set_nat_cache_dirty(nm_i, ne);
- up_write(&nm_i->nat_tree_lock);
}
update_nats_in_cursum(sum, -i);
mutex_unlock(&curseg->curseg_mutex);
@@ -1878,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
struct page *page = NULL;
- struct f2fs_nm_info *nm_i = NM_I(sbi);
/*
* there are two steps to flush nat entries:
@@ -1915,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
raw_ne = &nat_blk->entries[nid - start_nid];
}
raw_nat_from_node_info(raw_ne, &ne->ni);
-
- down_write(&NM_I(sbi)->nat_tree_lock);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- up_write(&NM_I(sbi)->nat_tree_lock);
-
if (nat_get_blkaddr(ne) == NULL_ADDR)
add_free_nid(sbi, nid, false);
}
@@ -1932,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, set->entry_cnt);
- down_write(&nm_i->nat_tree_lock);
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- up_write(&nm_i->nat_tree_lock);
kmem_cache_free(nat_entry_set_slab, set);
}
@@ -1954,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!nm_i->dirty_nat_cnt)
return;
+
+ down_write(&nm_i->nat_tree_lock);
+
/*
* if there is not enough space in the journal to store dirty nat
* entries, remove all entries from the journal and merge them
@@ -1962,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
- down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
@@ -1971,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
__adjust_nat_entry_set(setvec[idx], &sets,
MAX_NAT_JENTRIES(sum));
}
- up_write(&nm_i->nat_tree_lock);
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
+ up_write(&nm_i->nat_tree_lock);
+
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
@@ -2000,6 +1999,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+ nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
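
The node.c changes also shift nat_tree_lock outward: instead of each helper taking the rwsem around a single radix-tree update, get_node_info(), build_free_nids() and flush_nat_entries() now hold it across the whole journal-check/NAT-page/cache sequence, so a concurrent checkpoint can no longer interleave with a half-finished lookup. A condensed sketch of the resulting shape (simplified and illustrative, not the literal tree code):

/* sketch: coarse-grained NAT cache locking after this patch */
void get_node_info_shape(struct f2fs_sb_info *sbi, nid_t nid,
			 struct node_info *ni)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);

	down_write(&nm_i->nat_tree_lock);	/* held for the full lookup */

	/* 1. try the current segment summary (NAT journal)       */
	/* 2. otherwise read the on-disk NAT block                 */
	/* 3. cache_nat_entry() inserts without taking the lock -- */
	/*    it now relies on the caller owning nat_tree_lock     */

	up_write(&nm_i->nat_tree_lock);
}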
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e956ad81..d4d1f636fe1c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -14,9 +14,11 @@
/* node block offset on the NAT area dedicated to the given start node id */
#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
-/* # of pages to perform readahead before building free nids */
+/* # of pages to perform synchronous readahead before building free nids */
#define FREE_NID_PAGES 4
+#define DEF_RA_NID_PAGES 4 /* # of nid pages to be read ahead */
+
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
@@ -181,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
(seg_off << sbi->log_blocks_per_seg << 1) +
- (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+ (block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
block_addr += sbi->blocks_per_seg;
@@ -315,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page)
return true;
}
-static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
@@ -325,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
else
rn->in.nid[off] = cpu_to_le32(nid);
- set_page_dirty(p);
+ return set_page_dirty(p);
}
static inline nid_t get_nid(struct page *p, int off, bool i)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index faec2ca004b9..589b20b8677b 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
+static bool is_same_inode(struct inode *inode, struct page *ipage)
+{
+ struct f2fs_inode *ri = F2FS_INODE(ipage);
+ struct timespec disk;
+
+ if (!IS_INODE(ipage))
+ return true;
+
+ disk.tv_sec = le64_to_cpu(ri->i_ctime);
+ disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+ if (timespec_compare(&inode->i_ctime, &disk) > 0)
+ return false;
+
+ disk.tv_sec = le64_to_cpu(ri->i_atime);
+ disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+ if (timespec_compare(&inode->i_atime, &disk) > 0)
+ return false;
+
+ disk.tv_sec = le64_to_cpu(ri->i_mtime);
+ disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ if (timespec_compare(&inode->i_mtime, &disk) > 0)
+ return false;
+
+ return true;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
@@ -180,7 +206,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- ra_meta_pages(sbi, blkaddr, 1, META_POR);
+ ra_meta_pages(sbi, blkaddr, 1, META_POR, true);
while (1) {
struct fsync_inode_entry *entry;
@@ -188,7 +214,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
- page = get_meta_page(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
@@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (!entry) {
+ if (entry) {
+ if (!is_same_inode(entry->inode, page))
+ goto next;
+ } else {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
@@ -383,15 +412,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
start = start_bidx_of_node(ofs_of_node(page), fi);
end = start + ADDRS_PER_PAGE(page, fi);
- f2fs_lock_op(sbi);
-
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
- if (err) {
- f2fs_unlock_op(sbi);
+ if (err)
goto out;
- }
f2fs_wait_on_page_writeback(dn.node_page, NODE);
@@ -456,7 +481,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
err:
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
"recover_data: ino = %lx, recovered = %d blocks, err = %d",
@@ -464,8 +488,7 @@ out:
return err;
}
-static int recover_data(struct f2fs_sb_info *sbi,
- struct list_head *head, int type)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
@@ -474,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
block_t blkaddr;
/* get node pages in the current segment */
- curseg = CURSEG_I(sbi, type);
+ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (1) {
@@ -485,7 +508,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
ra_meta_pages_cond(sbi, blkaddr);
- page = get_meta_page(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page)) {
f2fs_put_page(page, 1);
@@ -561,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+ err = recover_data(sbi, &inode_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
@@ -570,7 +593,7 @@ out:
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
- MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+ (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
@@ -600,7 +623,7 @@ out:
.reason = CP_RECOVERY,
};
mutex_unlock(&sbi->cp_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
} else {
mutex_unlock(&sbi->cp_mutex);
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 78e6d0696847..5904a411c86f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,8 +14,8 @@
#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/kthread.h>
-#include <linux/vmalloc.h>
#include <linux/swap.h>
+#include <linux/timer.h>
#include "f2fs.h"
#include "segment.h"
@@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab;
static struct kmem_cache *sit_entry_set_slab;
static struct kmem_cache *inmem_entry_slab;
+static unsigned long __reverse_ulong(unsigned char *str)
+{
+ unsigned long tmp = 0;
+ int shift = 24, idx = 0;
+
+#if BITS_PER_LONG == 64
+ shift = 56;
+#endif
+ while (shift >= 0) {
+ tmp |= (unsigned long)str[idx++] << shift;
+ shift -= BITS_PER_BYTE;
+ }
+ return tmp;
+}
+
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
* MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word)
int num = 0;
#if BITS_PER_LONG == 64
- if ((word & 0xffffffff) == 0) {
+ if ((word & 0xffffffff00000000UL) == 0)
num += 32;
+ else
word >>= 32;
- }
#endif
- if ((word & 0xffff) == 0) {
+ if ((word & 0xffff0000) == 0)
num += 16;
+ else
word >>= 16;
- }
- if ((word & 0xff) == 0) {
+
+ if ((word & 0xff00) == 0)
num += 8;
+ else
word >>= 8;
- }
+
if ((word & 0xf0) == 0)
num += 4;
else
word >>= 4;
+
if ((word & 0xc) == 0)
num += 2;
else
word >>= 2;
+
if ((word & 0x2) == 0)
num += 1;
return num;
@@ -67,130 +86,83 @@ static inline unsigned long __reverse_ffs(unsigned long word)
/*
* __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be an integral multiple of BITS_PER_LONG.
* Example:
- * LSB <--> MSB
- * f2fs_set_bit(0, bitmap) => 0000 0001
- * f2fs_set_bit(7, bitmap) => 1000 0000
+ * MSB <--> LSB
+ * f2fs_set_bit(0, bitmap) => 1000 0000
+ * f2fs_set_bit(7, bitmap) => 0000 0001
*/
static unsigned long __find_rev_next_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
- while (!f2fs_test_bit(offset, (unsigned char *)addr))
- offset++;
-
- if (offset > size)
- offset = size;
-
- return offset;
-#if 0
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~0UL << quot;
- submask = (unsigned char)(0xff << rest) >> rest;
- submask <<= quot;
- mask &= submask;
- tmp &= mask;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
-aligned:
- while (size & ~(BITS_PER_LONG-1)) {
- tmp = *(p++);
+
+ while (1) {
+ if (*p == 0)
+ goto pass;
+
+ tmp = __reverse_ulong((unsigned char *)p);
+
+ tmp &= ~0UL >> offset;
+ if (size < BITS_PER_LONG)
+ tmp &= (~0UL << (BITS_PER_LONG - size));
if (tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
+ p++;
}
- if (!size)
- return result;
- tmp = *p;
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffs(tmp);
-#endif
+ return result;
+found:
+ return result - size + __reverse_ffs(tmp);
}
static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
- while (f2fs_test_bit(offset, (unsigned char *)addr))
- offset++;
-
- if (offset > size)
- offset = size;
-
- return offset;
-#if 0
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~(~0UL << quot);
- submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
- submask <<= quot;
- mask += submask;
- tmp |= mask;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
-aligned:
- while (size & ~(BITS_PER_LONG - 1)) {
- tmp = *(p++);
- if (~tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+
+ while (1) {
+ if (*p == ~0UL)
+ goto pass;
+
+ tmp = __reverse_ulong((unsigned char *)p);
+
+ if (offset)
+ tmp |= ~0UL << (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG)
+ tmp |= ~0UL >> size;
+ if (tmp != ~0UL)
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
+ p++;
}
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffz(tmp);
-#endif
+ return result;
+found:
+ return result - size + __reverse_ffz(tmp);
}
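A note on the arithmetic both rewritten helpers share: result stays fixed at the total size while size counts down to the number of bits left from the start of the word being examined, so result - size is the bit index where that word begins, and __reverse_ffs()/__reverse_ffz() supplies the MSB-first offset inside it. Worked example with 64-bit words and illustrative numbers: size = 128, offset = 70 becomes size = 64, offset = 6 after alignment; if the reversed word then has its first set bit at MSB-relative position 9 (past the masked-off first 6 positions), the function returns 128 - 64 + 9 = 73.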
void register_inmem_page(struct inode *inode, struct page *page)
@@ -241,7 +213,7 @@ int commit_inmem_pages(struct inode *inode, bool abort)
* inode becomes free by iget_locked in f2fs_iget.
*/
if (!abort) {
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
}
@@ -257,13 +229,15 @@ int commit_inmem_pages(struct inode *inode, bool abort)
trace_f2fs_commit_inmem_page(cur->page, INMEM);
fio.page = cur->page;
err = do_write_data_page(&fio);
- submit_bio = true;
if (err) {
unlock_page(cur->page);
break;
}
+ clear_cold_data(cur->page);
+ submit_bio = true;
}
} else {
+ ClearPageUptodate(cur->page);
trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
}
set_page_private(cur->page, 0);
@@ -288,15 +262,17 @@ int commit_inmem_pages(struct inode *inode, bool abort)
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
*/
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+ if (!need)
+ return;
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
*/
if (has_not_enough_free_secs(sbi, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi);
+ f2fs_gc(sbi, false);
}
}
@@ -316,8 +292,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
excess_prefree_segs(sbi) ||
- !available_free_memory(sbi, INO_ENTRIES))
+ !available_free_memory(sbi, INO_ENTRIES) ||
+ (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+ if (test_opt(sbi, DATA_FLUSH))
+ sync_dirty_inodes(sbi, FILE_INODE);
f2fs_sync_fs(sbi->sb, true);
+ stat_inc_bg_cp_count(sbi->stat_info);
+ }
}
static int issue_flush_thread(void *data)
@@ -767,6 +748,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
mutex_unlock(&sit_i->sentry_lock);
}
+bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int segno, offset;
+ struct seg_entry *se;
+ bool is_cp = false;
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+ return true;
+
+ mutex_lock(&sit_i->sentry_lock);
+
+ segno = GET_SEGNO(sbi, blkaddr);
+ se = get_seg_entry(sbi, segno);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+ if (f2fs_test_bit(offset, se->ckpt_valid_map))
+ is_cp = true;
+
+ mutex_unlock(&sit_i->sentry_lock);
+
+ return is_cp;
+}
+
/*
* This function should be resided under the curseg_mutex lock
*/
@@ -1116,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
unsigned int start_segno, end_segno;
struct cp_control cpc;
+ int err = 0;
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -1146,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
sbi->segs_per_sec) - 1, end_segno);
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
}
out:
range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
- return 0;
+ return err;
}
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -1292,6 +1298,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.encrypted_page = NULL,
};
+ if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
+ fio.rw &= ~REQ_META;
+
set_page_writeback(page);
f2fs_submit_page_mbio(&fio);
}
@@ -1369,7 +1378,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+ if (!recover_curseg)
+ update_sit_entry(sbi, new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
+
locate_dirty_segment(sbi, old_cursegno);
if (recover_curseg) {
@@ -1449,6 +1465,23 @@ void f2fs_wait_on_page_writeback(struct page *page,
}
}
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ struct page *cpage;
+
+ if (blkaddr == NEW_ADDR)
+ return;
+
+ f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+ cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
+ if (cpage) {
+ f2fs_wait_on_page_writeback(cpage, DATA);
+ f2fs_put_page(cpage, 1);
+ }
+}
+
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1586,7 +1619,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
if (npages >= 2)
ra_meta_pages(sbi, start_sum_block(sbi), npages,
- META_CP);
+ META_CP, true);
/* restore for compacted data summary */
if (read_compacted_summaries(sbi))
@@ -1596,7 +1629,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
if (__exist_node_summaries(sbi))
ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
- NR_CURSEG_TYPE - type, META_CP);
+ NR_CURSEG_TYPE - type, META_CP, true);
for (; type <= CURSEG_COLD_NODE; type++) {
err = read_normal_summaries(sbi, type);
@@ -1704,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
if (le32_to_cpu(nid_in_journal(sum, i)) == val)
return i;
}
- if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
+ if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL))
return update_nats_in_cursum(sum, 1);
} else if (type == SIT_JOURNAL) {
for (i = 0; i < sits_in_cursum(sum); i++)
if (le32_to_cpu(segno_in_journal(sum, i)) == val)
return i;
- if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+ if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL))
return update_sits_in_cursum(sum, 1);
}
return -1;
@@ -1955,12 +1988,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
+ sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -1982,8 +2016,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
- sizeof(struct sec_entry));
+ sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
}
@@ -2028,12 +2062,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -2082,7 +2116,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int nrpages = MAX_BIO_BLOCKS(sbi);
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+ readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -2174,7 +2208,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -2196,7 +2230,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2301,7 +2335,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
mutex_lock(&dirty_i->seglist_lock);
- kfree(dirty_i->dirty_segmap[dirty_type]);
+ kvfree(dirty_i->dirty_segmap[dirty_type]);
dirty_i->nr_dirty[dirty_type] = 0;
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -2309,7 +2343,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- kfree(dirty_i->victim_secmap);
+ kvfree(dirty_i->victim_secmap);
}
static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -2348,8 +2382,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
if (!free_i)
return;
SM_I(sbi)->free_info = NULL;
- kfree(free_i->free_segmap);
- kfree(free_i->free_secmap);
+ kvfree(free_i->free_segmap);
+ kvfree(free_i->free_secmap);
kfree(free_i);
}
@@ -2370,9 +2404,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
}
kfree(sit_i->tmp_map);
- vfree(sit_i->sentries);
- vfree(sit_i->sec_entries);
- kfree(sit_i->dirty_sentries_bitmap);
+ kvfree(sit_i->sentries);
+ kvfree(sit_i->sec_entries);
+ kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
kfree(sit_i->sit_bitmap);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index b6e4ed15c698..ee44d346ea44 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -137,10 +137,12 @@ enum {
/*
* BG_GC means the background cleaning job.
* FG_GC means the on-demand cleaning job.
+ * FORCE_FG_GC means an on-demand cleaning job issued from the background.
*/
enum {
BG_GC = 0,
- FG_GC
+ FG_GC,
+ FORCE_FG_GC,
};
/* for a function parameter to select a victim segment */
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index da0d8e0b55a5..93606f281bf9 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
{
- return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node);
+ return atomic_read(&sbi->total_zombie_tree) +
+ atomic_read(&sbi->total_ext_node);
}
unsigned long f2fs_shrink_count(struct shrinker *shrink,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f79478115d37..6134832baaaf 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -67,6 +67,7 @@ enum {
Opt_extent_cache,
Opt_noextent_cache,
Opt_noinline_data,
+ Opt_data_flush,
Opt_err,
};
@@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = {
{Opt_extent_cache, "extent_cache"},
{Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
+ {Opt_data_flush, "data_flush"},
{Opt_err, NULL},
};
@@ -213,8 +215,11 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -231,6 +236,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
+ ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(cp_interval),
+ ATTR_LIST(idle_interval),
NULL,
};
@@ -292,11 +300,16 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
- if (strlen(name) == 2 && !strncmp(name, "on", 2))
+ if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
set_opt(sbi, BG_GC);
- else if (strlen(name) == 3 && !strncmp(name, "off", 3))
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
clear_opt(sbi, BG_GC);
- else {
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
+ set_opt(sbi, BG_GC);
+ set_opt(sbi, FORCE_FG_GC);
+ } else {
kfree(name);
return -EINVAL;
}
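Besides "on" and "off", background_gc now accepts "sync", which sets both BG_GC and the new FORCE_FG_GC flag so that the cleaning normally done lazily in the background runs synchronously instead. As a usage example (device path hypothetical), mount -t f2fs -o background_gc=sync /dev/sdX /mnt selects this mode, and the f2fs_show_options() hunk below reports it back as background_gc=sync.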
@@ -397,6 +410,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_noinline_data:
clear_opt(sbi, INLINE_DATA);
break;
+ case Opt_data_flush:
+ set_opt(sbi, DATA_FLUSH);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -423,6 +439,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
fi->i_current_depth = 1;
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
+ INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
@@ -539,7 +556,7 @@ static void f2fs_put_super(struct super_block *sb)
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_dirty_inode(sbi);
+ release_ino_entry(sbi);
release_discard_addrs(sbi);
f2fs_leave_shrinker(sbi);
@@ -557,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb)
wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
- brelse(sbi->raw_super_buf);
+ kfree(sbi->raw_super);
kfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err = 0;
trace_f2fs_sync_fs(sb, sync);
@@ -573,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
cpc.reason = __get_cp_reason(sbi);
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
- } else {
- f2fs_balance_fs(sbi);
}
f2fs_trace_ios(NULL, 1);
- return 0;
+ return err;
}
static int f2fs_freeze(struct super_block *sb)
@@ -631,10 +647,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
- seq_printf(seq, ",background_gc=%s", "on");
- else
+ if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
+ if (test_opt(sbi, FORCE_FG_GC))
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else
+ seq_printf(seq, ",background_gc=%s", "on");
+ } else {
seq_printf(seq, ",background_gc=%s", "off");
+ }
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -673,6 +693,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",extent_cache");
else
seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, DATA_FLUSH))
+ seq_puts(seq, ",data_flush");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -742,6 +764,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
int err, active_logs;
bool need_restart_gc = false;
bool need_stop_gc = false;
+ bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
sync_filesystem(sb);
@@ -767,6 +790,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
goto skip;
+ /* disallow enable/disable extent_cache dynamically */
+ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "switch extent_cache option is not allowed");
+ goto restore_opts;
+ }
+
/*
* We stop the GC thread if FS is mounted as RO
* or if background_gc = off is passed in mount
@@ -876,7 +907,7 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static loff_t max_file_size(unsigned bits)
+static loff_t max_file_blocks(void)
{
loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
loff_t leaf_count = ADDRS_PER_BLOCK;
@@ -892,10 +923,82 @@ static loff_t max_file_size(unsigned bits)
leaf_count *= NIDS_PER_BLOCK;
result += leaf_count;
- result <<= bits;
return result;
}
+static inline bool sanity_check_area_boundary(struct super_block *sb,
+ struct f2fs_super_block *raw_super)
+{
+ u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+ u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
+ u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
+ u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr);
+ u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+ u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+ u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt);
+ u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit);
+ u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat);
+ u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa);
+ u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main);
+ u32 segment_count = le32_to_cpu(raw_super->segment_count);
+ u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+
+ if (segment0_blkaddr != cp_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Mismatch start address, segment0(%u) cp_blkaddr(%u)",
+ segment0_blkaddr, cp_blkaddr);
+ return true;
+ }
+
+ if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) !=
+ sit_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong CP boundary, start(%u) end(%u) blocks(%u)",
+ cp_blkaddr, sit_blkaddr,
+ segment_count_ckpt << log_blocks_per_seg);
+ return true;
+ }
+
+ if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) !=
+ nat_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong SIT boundary, start(%u) end(%u) blocks(%u)",
+ sit_blkaddr, nat_blkaddr,
+ segment_count_sit << log_blocks_per_seg);
+ return true;
+ }
+
+ if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) !=
+ ssa_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong NAT boundary, start(%u) end(%u) blocks(%u)",
+ nat_blkaddr, ssa_blkaddr,
+ segment_count_nat << log_blocks_per_seg);
+ return true;
+ }
+
+ if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) !=
+ main_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong SSA boundary, start(%u) end(%u) blocks(%u)",
+ ssa_blkaddr, main_blkaddr,
+ segment_count_ssa << log_blocks_per_seg);
+ return true;
+ }
+
+ if (main_blkaddr + (segment_count_main << log_blocks_per_seg) !=
+ segment0_blkaddr + (segment_count << log_blocks_per_seg)) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)",
+ main_blkaddr,
+ segment0_blkaddr + (segment_count << log_blocks_per_seg),
+ segment_count_main << log_blocks_per_seg);
+ return true;
+ }
+
+ return false;
+}
+
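The boundary check is pure arithmetic: each metadata area must end exactly where the next one begins, with area lengths expressed as segment counts shifted by log_blocks_per_seg. Worked example with illustrative numbers and log_blocks_per_seg = 9, i.e. 512 blocks per segment (the only value the new check further down accepts): if cp_blkaddr = 512 and segment_count_ckpt = 2, the CP area occupies 2 << 9 = 1024 blocks, so the superblock is rejected unless sit_blkaddr = 512 + 1024 = 1536.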
static int sanity_check_raw_super(struct super_block *sb,
struct f2fs_super_block *raw_super)
{
@@ -925,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb,
return 1;
}
+ /* check log blocks per segment */
+ if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid log blocks per segment (%u)",
+ le32_to_cpu(raw_super->log_blocks_per_seg));
+ return 1;
+ }
+
/* Currently, support 512/1024/2048/4096 bytes sector size */
if (le32_to_cpu(raw_super->log_sectorsize) >
F2FS_MAX_LOG_SECTOR_SIZE ||
@@ -943,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb,
le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
+
+ /* check reserved ino info */
+ if (le32_to_cpu(raw_super->node_ino) != 1 ||
+ le32_to_cpu(raw_super->meta_ino) != 2 ||
+ le32_to_cpu(raw_super->root_ino) != 3) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)",
+ le32_to_cpu(raw_super->node_ino),
+ le32_to_cpu(raw_super->meta_ino),
+ le32_to_cpu(raw_super->root_ino));
+ return 1;
+ }
+
+ /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
+ if (sanity_check_area_boundary(sb, raw_super))
+ return 1;
+
return 0;
}
@@ -996,6 +1124,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
+ sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
+ sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
INIT_LIST_HEAD(&sbi->s_list);
@@ -1009,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
*/
static int read_raw_super_block(struct super_block *sb,
struct f2fs_super_block **raw_super,
- struct buffer_head **raw_super_buf,
- int *recovery)
+ int *valid_super_block, int *recovery)
{
int block = 0;
- struct buffer_head *buffer;
- struct f2fs_super_block *super;
+ struct buffer_head *bh;
+ struct f2fs_super_block *super, *buf;
int err = 0;
+ super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
+ if (!super)
+ return -ENOMEM;
retry:
- buffer = sb_bread(sb, block);
- if (!buffer) {
+ bh = sb_bread(sb, block);
+ if (!bh) {
*recovery = 1;
f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
- err = -EIO;
- goto out;
- }
+ err = -EIO;
+ goto next;
}
- super = (struct f2fs_super_block *)
- ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET);
+ buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET);
/* sanity checking of raw super */
- if (sanity_check_raw_super(sb, super)) {
- brelse(buffer);
+ if (sanity_check_raw_super(sb, buf)) {
+ brelse(bh);
*recovery = 1;
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
- err = -EINVAL;
- goto out;
- }
+ err = -EINVAL;
+ goto next;
}
if (!*raw_super) {
- *raw_super_buf = buffer;
+ memcpy(super, buf, sizeof(*super));
+ *valid_super_block = block;
*raw_super = super;
- } else {
- /* already have a valid superblock */
- brelse(buffer);
}
+ brelse(bh);
+next:
/* check the validity of the second superblock */
if (block == 0) {
block++;
goto retry;
}
-out:
/* No valid superblock */
- if (!*raw_super)
+ if (!*raw_super) {
+ kfree(super);
return err;
+ }
return 0;
}
+static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
+{
+ struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi);
+ struct buffer_head *bh;
+ int err;
+
+ bh = sb_getblk(sbi->sb, block);
+ if (!bh)
+ return -EIO;
+
+ lock_buffer(bh);
+ memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ /* this is a rare path, so we can afford to use FUA every time */
+ err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+ brelse(bh);
+
+ return err;
+}
+
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
{
- struct buffer_head *sbh = sbi->raw_super_buf;
- sector_t block = sbh->b_blocknr;
int err;
/* write back-up superblock first */
- sbh->b_blocknr = block ? 0 : 1;
- mark_buffer_dirty(sbh);
- err = sync_dirty_buffer(sbh);
-
- sbh->b_blocknr = block;
+ err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1);
/* if we are in recovery path, skip writing valid superblock */
if (recover || err)
- goto out;
+ return err;
/* write current valid superblock */
- mark_buffer_dirty(sbh);
- err = sync_dirty_buffer(sbh);
-out:
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- return err;
+ return __f2fs_commit_super(sbi, sbi->valid_super_block);
}
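Worth noting about the ordering here: the backup superblock is written and FUA-synced first, and the currently-valid copy is rewritten only if that write succeeded, so a crash or I/O error at any point of the update still leaves at least one consistent superblock on disk.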
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
- struct buffer_head *raw_super_buf;
struct inode *root;
long err;
bool retry = true, need_fsck = false;
char *options = NULL;
- int recovery, i;
+ int recovery, i, valid_super_block;
try_onemore:
err = -EINVAL;
raw_super = NULL;
- raw_super_buf = NULL;
+ valid_super_block = -1;
recovery = 0;
/* allocate memory for f2fs-specific super block info */
@@ -1127,7 +1260,8 @@ try_onemore:
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery);
+ err = read_raw_super_block(sb, &raw_super, &valid_super_block,
+ &recovery);
if (err)
goto free_sbi;
@@ -1144,7 +1278,9 @@ try_onemore:
if (err)
goto free_options;
- sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+ sbi->max_file_blocks = max_file_blocks();
+ sb->s_maxbytes = sbi->max_file_blocks <<
+ le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
@@ -1160,7 +1296,7 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->sb = sb;
sbi->raw_super = raw_super;
- sbi->raw_super_buf = raw_super_buf;
+ sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
@@ -1213,8 +1349,10 @@ try_onemore:
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
sbi->alloc_valid_block_count = 0;
- INIT_LIST_HEAD(&sbi->dir_inode_list);
- spin_lock_init(&sbi->dir_inode_lock);
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
init_extent_cache_info(sbi);
@@ -1332,10 +1470,14 @@ try_onemore:
f2fs_commit_super(sbi, true);
}
+ f2fs_update_time(sbi, CP_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
free_kobj:
kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
@@ -1362,7 +1504,7 @@ free_meta_inode:
free_options:
kfree(options);
free_sb_buf:
- brelse(raw_super_buf);
+ kfree(raw_super);
free_sbi:
kfree(sbi);
@@ -1399,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
- f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info));
+ f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+ sizeof(struct f2fs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
@@ -1453,10 +1596,14 @@ static int __init init_f2fs_fs(void)
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
- f2fs_create_root_stats();
+ err = f2fs_create_root_stats();
+ if (err)
+ goto free_filesystem;
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
return 0;
+free_filesystem:
+ unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
free_crypto:
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 4de2286c0e4d..10f1e784fa23 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -25,49 +25,37 @@
#include "f2fs.h"
#include "xattr.h"
-static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- int total_len, prefix_len = 0;
- const char *prefix = NULL;
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
- prefix = XATTR_USER_PREFIX;
- prefix_len = XATTR_USER_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_TRUSTED:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- prefix = XATTR_TRUSTED_PREFIX;
- prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_SECURITY:
- prefix = XATTR_SECURITY_PREFIX;
- prefix_len = XATTR_SECURITY_PREFIX_LEN;
break;
default:
return -EINVAL;
}
-
- total_len = prefix_len + len + 1;
- if (list && total_len <= list_size) {
- memcpy(list, prefix, prefix_len);
- memcpy(list + prefix_len, name, len);
- list[prefix_len + len] = '\0';
- }
- return total_len;
+ return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+ buffer, size, NULL);
}
-static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
@@ -81,72 +69,39 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
default:
return -EINVAL;
}
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
+ return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+ value, size, NULL, flags);
}
-static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static bool f2fs_xattr_user_list(struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
- case F2FS_XATTR_INDEX_USER:
- if (!test_opt(sbi, XATTR_USER))
- return -EOPNOTSUPP;
- break;
- case F2FS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- break;
- case F2FS_XATTR_INDEX_SECURITY:
- break;
- default:
- return -EINVAL;
- }
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- return f2fs_setxattr(d_inode(dentry), type, name,
- value, size, NULL, flags);
+ return test_opt(sbi, XATTR_USER);
}
-static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static bool f2fs_xattr_trusted_list(struct dentry *dentry)
{
- const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
- size_t size;
-
- if (type != F2FS_XATTR_INDEX_ADVISE)
- return 0;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return capable(CAP_SYS_ADMIN);
}
-static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
if (buffer)
*((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
}
-static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
@@ -185,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir,
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_user_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
@@ -193,15 +148,14 @@ const struct xattr_handler f2fs_xattr_user_handler = {
const struct xattr_handler f2fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.flags = F2FS_XATTR_INDEX_TRUSTED,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_trusted_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
const struct xattr_handler f2fs_xattr_advise_handler = {
- .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+ .name = F2FS_SYSTEM_ADVISE_NAME,
.flags = F2FS_XATTR_INDEX_ADVISE,
- .list = f2fs_xattr_advise_list,
.get = f2fs_xattr_advise_get,
.set = f2fs_xattr_advise_set,
};
@@ -209,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
const struct xattr_handler f2fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = F2FS_XATTR_INDEX_SECURITY,
- .list = f2fs_xattr_generic_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
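The conversion above follows the reworked generic xattr_handler API: ->list shrinks to a visibility predicate (the caller, here f2fs_listxattr(), now builds the prefixed name itself), ->get and ->set receive the handler so the namespace index travels in handler->flags, and single-name attributes such as system.advise use .name in place of .prefix. A hedged sketch of a minimal handler in the new style; the example_* helpers, EXAMPLE_XATTR_INDEX, and the "example." namespace are hypothetical stand-ins, not part of this patch:

static bool example_xattr_list(struct dentry *dentry)
{
	return true;	/* namespace always visible in listxattr(2) */
}

static int example_xattr_get(const struct xattr_handler *handler,
		struct dentry *dentry, const char *name, void *buffer,
		size_t size)
{
	/* @name arrives with the "example." prefix already stripped */
	return example_getxattr(d_inode(dentry), handler->flags, name,
				buffer, size);
}

static int example_xattr_set(const struct xattr_handler *handler,
		struct dentry *dentry, const char *name, const void *value,
		size_t size, int flags)
{
	return example_setxattr(d_inode(dentry), handler->flags, name,
				value, size, flags);
}

const struct xattr_handler example_xattr_handler = {
	.prefix	= "example.",
	.flags	= EXAMPLE_XATTR_INDEX,
	.list	= example_xattr_list,
	.get	= example_xattr_get,
	.set	= example_xattr_set,
};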
@@ -457,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
+ const char *prefix;
+ size_t prefix_len;
size_t size;
- if (!handler)
+ if (!handler || (handler->list && !handler->list(dentry)))
continue;
- size = handler->list(dentry, buffer, rest, entry->e_name,
- entry->e_name_len, handler->flags);
- if (buffer && size > rest) {
- error = -ERANGE;
- goto cleanup;
+ prefix = handler->prefix ?: handler->name;
+ prefix_len = strlen(prefix);
+ size = prefix_len + entry->e_name_len + 1;
+ if (buffer) {
+ if (size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
-
- if (buffer)
- buffer += size;
rest -= size;
}
error = buffer_size - rest;
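With handlers no longer formatting list entries, f2fs_listxattr() concatenates prefix, entry name, and a terminating NUL itself: an on-disk entry "foo" in the user namespace emits the 9 bytes "user.foo\0" (5 + 3 + 1), and an entry is skipped outright when its handler's ->list callback reports the namespace invisible, e.g. trusted.* for a caller without CAP_SYS_ADMIN.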
@@ -611,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (ipage)
return __f2fs_setxattr(inode, index, name, value,
size, ipage, flags);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
/* protect xattr_ver */
@@ -620,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
return err;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 71a7100d5492..79dccc8252dd 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -27,7 +27,7 @@
#define F2FS_XATTR_REFCOUNT_MAX 1024
/* Name indexes */
-#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
+#define F2FS_SYSTEM_ADVISE_NAME "system.advise"
#define F2FS_XATTR_INDEX_USER 1
#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9d2e41..8b2127ffb226 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -610,9 +610,9 @@ parse_record:
int status = fat_parse_long(inode, &cpos, &bh, &de,
&unicode, &nr_slots);
if (status < 0) {
- ctx->pos = cpos;
+ bh = NULL;
ret = status;
- goto out;
+ goto end_of_dir;
} else if (status == PARSE_INVALID)
goto record_end;
else if (status == PARSE_NOT_LONGNAME)
@@ -654,8 +654,9 @@ parse_record:
fill_len = short_len;
start_filldir:
- if (!fake_offset)
- ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+ ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+ if (fake_offset && ctx->pos < 2)
+ ctx->pos = 2;
if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
if (!dir_emit_dot(file, ctx))
@@ -681,14 +682,19 @@ record_end:
fake_offset = 0;
ctx->pos = cpos;
goto get_new;
+
end_of_dir:
- ctx->pos = cpos;
+ if (fake_offset && cpos < 2)
+ ctx->pos = 2;
+ else
+ ctx->pos = cpos;
fill_failed:
brelse(bh);
if (unicode)
__putname(unicode);
out:
mutex_unlock(&sbi->s_lock);
+
return ret;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..6aece96df19f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void)
fat_inode_cachep = kmem_cache_create("fat_inode_cache",
sizeof(struct msdos_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (fat_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ee85cd4e136a..350a2c8cfd28 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -51,7 +51,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (arg & O_NDELAY)
arg |= O_NONBLOCK;
- if (arg & O_DIRECT) {
+ /* Packetized pipe mode is controlled by the O_DIRECT flag */
+ if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) {
if (!filp->f_mapping || !filp->f_mapping->a_ops ||
!filp->f_mapping->a_ops->direct_IO)
return -EINVAL;
diff --git a/fs/file.c b/fs/file.c
index 6c672ad329e9..1fbc5c0555a9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,9 +25,9 @@
int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
-/* our max() is unusable in constant expressions ;-/ */
-#define __const_max(x, y) ((x) < (y) ? (x) : (y))
-int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+/* our min() is unusable in constant expressions ;-/ */
+#define __const_min(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
-BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
* vmalloc() if the allocation size will be considered "large" by the VM.
*/
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+ void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+ __GFP_NOWARN | __GFP_NORETRY);
if (data != NULL)
return data;
}
- return vmalloc(size);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
@@ -56,9 +57,35 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
__free_fdtable(container_of(rcu, struct fdtable, rcu));
}
+#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
+#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+
+/*
+ * Copy 'count' fd bits from the old table to the new table and clear the extra
+ * space if any. This does not copy the file pointers. Called with the files
+ * spinlock held for write.
+ */
+static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int count)
+{
+ unsigned int cpy, set;
+
+ cpy = count / BITS_PER_BYTE;
+ set = (nfdt->max_fds - count) / BITS_PER_BYTE;
+ memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+ memset((char *)nfdt->open_fds + cpy, 0, set);
+ memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+ memset((char *)nfdt->close_on_exec + cpy, 0, set);
+
+ cpy = BITBIT_SIZE(count);
+ set = BITBIT_SIZE(nfdt->max_fds) - cpy;
+ memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
+ memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+}
+
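Worked sizes for copy_fd_bitmaps() on a 64-bit kernel, with illustrative values count = 128 and nfdt->max_fds = 256: open_fds and close_on_exec each copy cpy = 128 / 8 = 16 bytes and zero set = (256 - 128) / 8 = 16 bytes; for the new summary bitmap, BITBIT_SIZE(128) = BITS_TO_LONGS(BITS_TO_LONGS(128)) * sizeof(long) = 8 bytes are copied and BITBIT_SIZE(256) - 8 = 0 bytes zeroed, since both 128 and 256 fds fit in a single summary word.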
/*
- * Expand the fdset in the files_struct. Called with the files spinlock
- * held for write.
+ * Copy all file descriptors from the old table to the new, expanded table and
+ * clear the extra space. Called with the files spinlock held for write.
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
@@ -69,14 +96,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
cpy = ofdt->max_fds * sizeof(struct file *);
set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
memcpy(nfdt->fd, ofdt->fd, cpy);
- memset((char *)(nfdt->fd) + cpy, 0, set);
+ memset((char *)nfdt->fd + cpy, 0, set);
- cpy = ofdt->max_fds / BITS_PER_BYTE;
- set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)(nfdt->open_fds) + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+ copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
static struct fdtable * alloc_fdtable(unsigned int nr)
@@ -105,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
@@ -115,12 +137,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data;
data = alloc_fdmem(max_t(size_t,
- 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+ 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
if (!data)
goto out_arr;
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
+ data += nr / BITS_PER_BYTE;
+ fdt->full_fds_bits = data;
return fdt;
@@ -226,17 +250,22 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
- __clear_bit(fd, fdt->close_on_exec);
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
}
-static inline void __set_open_fd(int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->open_fds);
+ fd /= BITS_PER_LONG;
+ if (!~fdt->open_fds[fd])
+ __set_bit(fd, fdt->full_fds_bits);
}
-static inline void __clear_open_fd(int fd, struct fdtable *fdt)
+static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
+ __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
static int count_open_files(struct fdtable *fdt)
@@ -262,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, size, i;
+ int open_files, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -280,6 +309,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
+ new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
spin_lock(&oldf->file_lock);
@@ -318,12 +348,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
open_files = count_open_files(old_fdt);
}
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
- memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
- memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
-
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f) {
@@ -341,19 +370,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
}
spin_unlock(&oldf->file_lock);
- /* compute the remainder to be cleared */
- size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
- /* This is long word aligned thus could use a optimized version */
- memset(new_fds, 0, size);
-
- if (new_fdt->max_fds > open_files) {
- int left = (new_fdt->max_fds - open_files) / 8;
- int start = open_files / BITS_PER_LONG;
-
- memset(&new_fdt->open_fds[start], 0, left);
- memset(&new_fdt->close_on_exec[start], 0, left);
- }
+ /* clear the remainder */
+ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt);
@@ -454,10 +472,25 @@ struct files_struct init_files = {
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
+ .full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
+static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
+{
+ unsigned long maxfd = fdt->max_fds;
+ unsigned long maxbit = maxfd / BITS_PER_LONG;
+ unsigned long bitbit = start / BITS_PER_LONG;
+
+ bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
+ if (bitbit > maxfd)
+ return maxfd;
+ if (bitbit > start)
+ start = bitbit;
+ return find_next_zero_bit(fdt->open_fds, maxfd, start);
+}
+
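full_fds_bits is a second-level summary of open_fds: __set_open_fd() sets bit i once word i of open_fds becomes all-ones, and __clear_open_fd() clears it as soon as any fd in that word is released, so find_next_fd() can leap over fully-occupied 64-fd words instead of scanning them bit by bit. Illustrative example: with fds 0-191 all open and start = 70, find_next_zero_bit() on full_fds_bits returns 3, bitbit becomes 192, and the per-bit search of open_fds begins at fd 192 rather than testing the 122 set bits in between.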
/*
* allocate a file descriptor, mark it busy.
*/
@@ -476,7 +509,7 @@ repeat:
fd = files->next_fd;
if (fd < fdt->max_fds)
- fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
+ fd = find_next_fd(fdt, fd);
/*
* N.B. For clone tasks sharing a files structure, this test
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ef73ed674a27..3e2ccade61ed 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
} else if (S_ISLNK(ip->i_mode)) {
if (!VXFS_ISIMMED(vip)) {
ip->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &vxfs_aops;
} else {
ip->i_op = &simple_symlink_inode_operations;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 29e4599f6fc1..6915c950e6e8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
if (!wbc->wb)
return;
- rcu_read_lock();
id = mem_cgroup_css_from_page(page)->id;
- rcu_read_unlock();
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
@@ -779,8 +777,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
bool skip_if_busy)
{
struct bdi_writeback *last_wb = NULL;
- struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
- struct bdi_writeback, bdi_node);
+ struct bdi_writeback *wb = list_entry(&bdi->wb_list,
+ struct bdi_writeback, bdi_node);
might_sleep();
restart:
@@ -1981,9 +1979,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
struct super_block *sb = inode->i_sb;
int dirtytime;
@@ -2093,6 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
out_unlock_inode:
spin_unlock(&inode->i_lock);
+#undef I_DIRTY_INODE
}
EXPORT_SYMBOL(__mark_inode_dirty);
@@ -2149,7 +2148,12 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
old_inode = inode;
- filemap_fdatawait(mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(mapping);
cond_resched();
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d403c69bee08..4304072161aa 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
switch (cookie->def->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 6d941f56faf4..9b28649df3a1 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -22,6 +22,7 @@ static LIST_HEAD(fscache_netfs_list);
int __fscache_register_netfs(struct fscache_netfs *netfs)
{
struct fscache_netfs *ptr;
+ struct fscache_cookie *cookie;
int ret;
_enter("{%s}", netfs->name);
@@ -29,29 +30,25 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
INIT_LIST_HEAD(&netfs->link);
/* allocate a cookie for the primary index */
- netfs->primary_index =
- kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+ cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
- if (!netfs->primary_index) {
+ if (!cookie) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
/* initialise the primary index cookie */
- atomic_set(&netfs->primary_index->usage, 1);
- atomic_set(&netfs->primary_index->n_children, 0);
- atomic_set(&netfs->primary_index->n_active, 1);
+ atomic_set(&cookie->usage, 1);
+ atomic_set(&cookie->n_children, 0);
+ atomic_set(&cookie->n_active, 1);
- netfs->primary_index->def = &fscache_fsdef_netfs_def;
- netfs->primary_index->parent = &fscache_fsdef_index;
- netfs->primary_index->netfs_data = netfs;
- netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED;
+ cookie->def = &fscache_fsdef_netfs_def;
+ cookie->parent = &fscache_fsdef_index;
+ cookie->netfs_data = netfs;
+ cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
- atomic_inc(&netfs->primary_index->parent->usage);
- atomic_inc(&netfs->primary_index->parent->n_children);
-
- spin_lock_init(&netfs->primary_index->lock);
- INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+ spin_lock_init(&cookie->lock);
+ INIT_HLIST_HEAD(&cookie->backing_objects);
/* check the netfs type is not already present */
down_write(&fscache_addremove_sem);
@@ -62,6 +59,10 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
goto already_registered;
}
+ atomic_inc(&cookie->parent->usage);
+ atomic_inc(&cookie->parent->n_children);
+
+ netfs->primary_index = cookie;
list_add(&netfs->link, &fscache_netfs_list);
ret = 0;
@@ -70,11 +71,8 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
already_registered:
up_write(&fscache_addremove_sem);
- if (ret < 0) {
- netfs->primary_index->parent = NULL;
- __fscache_cookie_put(netfs->primary_index);
- netfs->primary_index = NULL;
- }
+ if (ret < 0)
+ kmem_cache_free(fscache_cookie_jar, cookie);
_leave(" = %d", ret);
return ret;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 51dde817e1f2..6b028b7c4250 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -316,7 +316,7 @@ static const struct seq_operations fscache_objlist_ops = {
static void fscache_objlist_config(struct fscache_objlist_data *data)
{
#ifdef CONFIG_KEYS
- struct user_key_payload *confkey;
+ const struct user_key_payload *confkey;
unsigned long config;
struct key *key;
const char *buf;
@@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
config = 0;
rcu_read_lock();
- confkey = key->payload.data;
+ confkey = user_key_payload(key);
buf = confkey->data;
for (len = confkey->datalen - 1; len >= 0; len--) {
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 483bbc613bf0..6b35fc4860a0 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
/*
* decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
*/
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
struct page *page,
@@ -122,7 +122,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -132,7 +132,7 @@ page_busy:
_debug("fscache writeout timeout page: %p{%lx}",
page, page->index);
- gfp &= ~__GFP_WAIT;
+ gfp &= ~__GFP_DIRECT_RECLAIM;
goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -816,7 +816,7 @@ static void fscache_write_op(struct fscache_operation *_op)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index > op->store_limit) {
+ if (page->index >= op->store_limit) {
fscache_stat(&fscache_n_store_pages_over_limit);
goto superseded;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5e2e08712d3b..712601f299b8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1365,15 +1365,19 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
return err;
}
-static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
+static const char *fuse_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
char *link;
ssize_t ret;
- link = (char *) __get_free_page(GFP_KERNEL);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ link = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!link)
return ERR_PTR(-ENOMEM);
@@ -1385,11 +1389,11 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
args.out.args[0].value = link;
ret = fuse_simple_request(fc, &args);
if (ret < 0) {
- free_page((unsigned long) link);
+ kfree(link);
link = ERR_PTR(ret);
} else {
link[ret] = '\0';
- *cookie = link;
+ set_delayed_call(done, kfree_link, link);
}
fuse_invalidate_atime(inode);
return link;
@@ -1909,8 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = {
static const struct inode_operations fuse_symlink_inode_operations = {
.setattr = fuse_setattr,
- .follow_link = fuse_follow_link,
- .put_link = free_page_put_link,
+ .get_link = fuse_get_link,
.readlink = generic_readlink,
.getattr = fuse_getattr,
.setxattr = fuse_setxattr,
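The fuse conversion also illustrates the contract of the new ->get_link operation: the inode is passed explicitly, a NULL dentry signals RCU walk, where fuse cannot sleep to issue a request and so returns -ECHILD to force ref-walk, and set_delayed_call(done, kfree_link, link) transfers ownership of the buffer to the VFS, which frees it once the caller is done with the returned string, replacing the old follow_link/put_link pair.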
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 47f181191060..aa03aab6a24f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2189,7 +2189,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
int err;
if (fc->no_flock) {
- err = flock_lock_file_wait(file, fl);
+ err = locks_lock_file_wait(file, fl);
} else {
struct fuse_file *ff = file->private_data;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2a5b99..4d69d5c0bedc 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
int err;
fuse_inode_cachep = kmem_cache_create("fuse_inode",
- sizeof(struct fuse_inode),
- 0, SLAB_HWCACHE_ALIGN,
+ sizeof(struct fuse_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
fuse_inode_init_once);
err = -ENOMEM;
if (!fuse_inode_cachep)
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1be3b061c05c..791932617d1a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type)
{
switch (type) {
case ACL_TYPE_ACCESS:
- return GFS2_POSIX_ACL_ACCESS;
+ return XATTR_POSIX_ACL_ACCESS;
case ACL_TYPE_DEFAULT:
- return GFS2_POSIX_ACL_DEFAULT;
+ return XATTR_POSIX_ACL_DEFAULT;
}
return NULL;
}
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 2d65ec4cd4be..3af4f407a483 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -12,8 +12,6 @@
#include "incore.h"
-#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1caee0534587..93f07465e5a6 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -914,7 +914,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
failed:
gfs2_trans_end(sdp);
gfs2_inplace_release(ip);
- if (ip->i_res->rs_qa_qd_num)
+ if (ip->i_qadata && ip->i_qadata->qa_qd_num)
gfs2_quota_unlock(ip);
if (inode == sdp->sd_rindex) {
gfs2_glock_dq(&m_ip->i_gh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 61296ecbd0e2..0860f0b5b3f1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -787,8 +787,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out_rlist;
- if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(ip->i_res);
+ if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
+ gfs2_rs_deltree(&ip->i_res);
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
@@ -1291,13 +1291,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
if (ret)
return ret;
- ret = get_write_access(inode);
- if (ret)
- return ret;
-
inode_dio_wait(inode);
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
@@ -1307,10 +1303,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
goto out;
}
- gfs2_rs_deltree(ip->i_res);
ret = do_shrink(inode, oldsize, newsize);
out:
- put_write_access(inode);
+ gfs2_rsqa_delete(ip, NULL);
return ret;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b42d94..6a92592304fb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000
+#define GFS2_USE_HASH_FLAG 0x2000
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;
@@ -108,7 +110,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head *bh;
int error;
- error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh);
if (error)
return error;
if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -305,7 +307,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
BUG_ON(extlen < 1);
bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
} else {
- error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh);
if (error)
goto fail;
}
@@ -388,8 +390,13 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
+ __be64 *hc;
+
+ spin_lock(&ip->i_inode.i_lock);
+ hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
+ spin_unlock(&ip->i_inode.i_lock);
+
kvfree(hc);
}
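
The invalidation above now detaches i_hash_cache under i_inode.i_lock before freeing it, closing a race where two concurrent invalidations could both observe the same pointer and double-kvfree it. A minimal sketch of the idiom, with hypothetical names:

	/* Detach-then-free: take ownership of the shared pointer under the
	 * lock, free outside it; a racing caller sees NULL and frees nothing. */
	static void cache_inval(struct obj *o)
	{
		void *victim;

		spin_lock(&o->lock);
		victim = o->cache;
		o->cache = NULL;
		spin_unlock(&o->lock);
		kvfree(victim);		/* kvfree(NULL) is a no-op */
	}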
@@ -438,6 +445,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent,
return 0;
}
+/* Look for the dirent that contains the offset specified in ptr. Once we
+ * find that dirent, there must be space available there for the new dirent */
+static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *ptr)
+{
+ unsigned required = GFS2_DIRENT_SIZE(name->len);
+ unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+ if (ptr < (void *)dent || ptr >= (void *)dent + totlen)
+ return 0;
+ if (gfs2_dirent_sentinel(dent))
+ actual = 0;
+ if (ptr < (void *)dent + actual)
+ return -1;
+ if ((void *)dent + totlen >= ptr + required)
+ return 1;
+ return -1;
+}
+
static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
const struct qstr *name,
void *opaque)
@@ -677,6 +705,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
prev->de_rec_len = cpu_to_be16(prev_rec_len);
}
+
+static struct gfs2_dirent *do_init_dirent(struct inode *inode,
+ struct gfs2_dirent *dent,
+ const struct qstr *name,
+ struct buffer_head *bh,
+ unsigned offset)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_dirent *ndent;
+ unsigned totlen;
+
+ totlen = be16_to_cpu(dent->de_rec_len);
+ BUG_ON(offset + name->len > totlen);
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ ndent = (struct gfs2_dirent *)((char *)dent + offset);
+ dent->de_rec_len = cpu_to_be16(offset);
+ gfs2_qstr2dirent(name, totlen - offset, ndent);
+ return ndent;
+}
+
+
/*
* Takes a dent from which to grab space as an argument. Returns the
* newly created dent.
@@ -686,31 +735,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
const struct qstr *name,
struct buffer_head *bh)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_dirent *ndent;
- unsigned offset = 0, totlen;
+ unsigned offset = 0;
if (!gfs2_dirent_sentinel(dent))
offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
- totlen = be16_to_cpu(dent->de_rec_len);
- BUG_ON(offset + name->len > totlen);
- gfs2_trans_add_meta(ip->i_gl, bh);
- ndent = (struct gfs2_dirent *)((char *)dent + offset);
- dent->de_rec_len = cpu_to_be16(offset);
- gfs2_qstr2dirent(name, totlen - offset, ndent);
- return ndent;
+ return do_init_dirent(inode, dent, name, bh, offset);
}
-static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
- struct buffer_head *bh,
- const struct qstr *name)
+static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode,
+ struct buffer_head *bh,
+ const struct qstr *name,
+ void *ptr)
{
struct gfs2_dirent *dent;
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
- gfs2_dirent_find_space, name, NULL);
+ gfs2_dirent_find_offset, name, ptr);
if (!dent || IS_ERR(dent))
return dent;
- return gfs2_init_dirent(inode, dent, name, bh);
+ return do_init_dirent(inode, dent, name, bh,
+ (unsigned)(ptr - (void *)dent));
}
static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
@@ -718,7 +761,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
{
int error;
- error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+ error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp);
if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
/* pr_info("block num=%llu\n", leaf_no); */
error = -EIO;
@@ -1046,10 +1089,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
if (!gfs2_dirent_sentinel(dent) &&
be32_to_cpu(dent->de_hash) < divider) {
struct qstr str;
+ void *ptr = ((char *)dent - obh->b_data) + nbh->b_data;
str.name = (char*)(dent+1);
str.len = be16_to_cpu(dent->de_name_len);
str.hash = be32_to_cpu(dent->de_hash);
- new = gfs2_dirent_alloc(inode, nbh, &str);
+ new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr);
if (IS_ERR(new)) {
error = PTR_ERR(new);
break;
@@ -1181,10 +1225,10 @@ static int compare_dents(const void *a, const void *b)
int ret = 0;
dent_a = *(const struct gfs2_dirent **)a;
- hash_a = be32_to_cpu(dent_a->de_hash);
+ hash_a = dent_a->de_cookie;
dent_b = *(const struct gfs2_dirent **)b;
- hash_b = be32_to_cpu(dent_b->de_hash);
+ hash_b = dent_b->de_cookie;
if (hash_a > hash_b)
ret = 1;
@@ -1222,19 +1266,20 @@ static int compare_dents(const void *a, const void *b)
*/
static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
- const struct gfs2_dirent **darr, u32 entries,
- int *copied)
+ struct gfs2_dirent **darr, u32 entries,
+ u32 sort_start, int *copied)
{
const struct gfs2_dirent *dent, *dent_next;
u64 off, off_next;
unsigned int x, y;
int run = 0;
- sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+ if (sort_start < entries)
+ sort(&darr[sort_start], entries - sort_start,
+ sizeof(struct gfs2_dirent *), compare_dents, NULL);
dent_next = darr[0];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
+ off_next = dent_next->de_cookie;
for (x = 0, y = 1; x < entries; x++, y++) {
dent = dent_next;
@@ -1242,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
if (y < entries) {
dent_next = darr[y];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
+ off_next = dent_next->de_cookie;
if (off < ctx->pos)
continue;
@@ -1290,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
return ptr;
}
+
+static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ unsigned leaf_nr, struct gfs2_dirent **darr,
+ unsigned entries)
+{
+ int sort_id = -1;
+ int i;
+
+ for (i = 0; i < entries; i++) {
+ unsigned offset;
+
+ darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
+ darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
+
+ if (!sdp->sd_args.ar_loccookie)
+ continue;
+ offset = (char *)(darr[i]) -
+ (bh->b_data + gfs2_dirent_offset(bh->b_data));
+ offset /= GFS2_MIN_DIRENT_SIZE;
+ offset += leaf_nr * sdp->sd_max_dents_per_leaf;
+ if (offset >= GFS2_USE_HASH_FLAG ||
+ leaf_nr >= GFS2_USE_HASH_FLAG) {
+ darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
+ if (sort_id < 0)
+ sort_id = i;
+ continue;
+ }
+ darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
+ darr[i]->de_cookie |= offset;
+ }
+ return sort_id;
+}
+
+
static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
int *copied, unsigned *depth,
u64 leaf_no)
@@ -1299,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
struct buffer_head *bh;
struct gfs2_leaf *lf;
unsigned entries = 0, entries2 = 0;
- unsigned leaves = 0;
- const struct gfs2_dirent **darr, *dent;
+ unsigned leaves = 0, leaf = 0, offset, sort_offset;
+ struct gfs2_dirent **darr, *dent;
struct dirent_gather g;
struct buffer_head **larr;
- int leaf = 0;
- int error, i;
+ int error, i, need_sort = 0, sort_id;
u64 lfn = leaf_no;
do {
@@ -1320,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
brelse(bh);
} while(lfn);
+ if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+ need_sort = 1;
+ sort_offset = 0;
+ }
+
if (!entries)
return 0;
@@ -1333,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
if (!larr)
goto out;
- darr = (const struct gfs2_dirent **)(larr + leaves);
- g.pdent = darr;
+ darr = (struct gfs2_dirent **)(larr + leaves);
+ g.pdent = (const struct gfs2_dirent **)darr;
g.offset = 0;
lfn = leaf_no;
@@ -1345,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
lf = (struct gfs2_leaf *)bh->b_data;
lfn = be64_to_cpu(lf->lf_next);
if (lf->lf_entries) {
+ offset = g.offset;
entries2 += be16_to_cpu(lf->lf_entries);
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1362,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
goto out_free;
}
error = 0;
+ sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+ be16_to_cpu(lf->lf_entries));
+ if (!need_sort && sort_id >= 0) {
+ need_sort = 1;
+ sort_offset = offset + sort_id;
+ }
larr[leaf++] = bh;
} else {
+ larr[leaf++] = NULL;
brelse(bh);
}
} while(lfn);
BUG_ON(entries2 != entries);
- error = do_filldir_main(ip, ctx, darr, entries, copied);
+ error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
+ sort_offset : entries, copied);
out_free:
for(i = 0; i < leaf; i++)
- brelse(larr[i]);
+ if (larr[i])
+ brelse(larr[i]);
kvfree(larr);
out:
return error;
@@ -1478,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct dirent_gather g;
- const struct gfs2_dirent **darr, *dent;
+ struct gfs2_dirent **darr, *dent;
struct buffer_head *dibh;
int copied = 0;
int error;
@@ -1502,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
/* 96 is max number of dirents which can be stuffed into an inode */
darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
if (darr) {
- g.pdent = darr;
+ g.pdent = (const struct gfs2_dirent **)darr;
g.offset = 0;
dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1519,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
error = -EIO;
goto out;
}
+ gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
error = do_filldir_main(dip, ctx, darr,
- dip->i_entries, &copied);
+ dip->i_entries, 0, &copied);
out:
kfree(darr);
}
@@ -1555,15 +1648,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
+ struct inode *inode;
+ u16 rahead;
+
if (IS_ERR(dent))
return ERR_CAST(dent);
dtype = be16_to_cpu(dent->de_type);
+ rahead = be16_to_cpu(dent->de_rahead);
addr = be64_to_cpu(dent->de_inum.no_addr);
formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
if (fail_on_exist)
return ERR_PTR(-EEXIST);
- return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+ if (!IS_ERR(inode))
+ GFS2_I(inode)->i_rahead = rahead;
+ return inode;
}
return ERR_PTR(-ENOENT);
}
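
Taken together, the dir.c changes above implement location-based readdir cookies: with the loccookie mount argument set, an entry's cookie encodes which leaf it lives in and its slot within that leaf, so cookies stay unique without sorting every leaf, and dir_split_leaf() switches to gfs2_dirent_split_alloc() so a moved entry is re-created at the same byte offset in the new leaf and its cookie survives the split. Entries whose location overflows the available bits fall back to hash cookies and are sorted from sort_offset onwards. A worked sketch of the packing in gfs2_set_cookies() (variable names partly hypothetical):

	/* One entry's cookie; GFS2_USE_HASH_FLAG = 0x2000 leaves 13 bits
	 * of location below the hash-index bits kept by the mask. */
	u32 cookie = gfs2_disk_hash2offset(be32_to_cpu(dent->de_hash));
	u32 slot   = dirent_byte_offset / GFS2_MIN_DIRENT_SIZE;  /* slot in leaf */
	u32 loc    = leaf_nr * sdp->sd_max_dents_per_leaf + slot;

	if (loc >= GFS2_USE_HASH_FLAG || leaf_nr >= GFS2_USE_HASH_FLAG)
		cookie |= GFS2_USE_HASH_FLAG;	/* overflow: hash cookie, must sort */
	else
		cookie = (cookie & GFS2_HASH_INDEX_MASK) | loc;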
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cf4ab89159f4..7412863cda1e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
gfsflags &= ~GFS2_DIF_TOPDIR;
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- return do_gfs2_set_flags(filp, gfsflags, ~0);
+ return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM);
}
- return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
+ return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA));
}
static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -336,8 +336,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
int hint = min_t(size_t, INT_MAX, blks);
- if (hint > atomic_read(&ip->i_res->rs_sizehint))
- atomic_set(&ip->i_res->rs_sizehint, hint);
+ if (hint > atomic_read(&ip->i_res.rs_sizehint))
+ atomic_set(&ip->i_res.rs_sizehint, hint);
}
/**
@@ -397,14 +397,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Update file times before taking page lock */
file_update_time(vma->vm_file);
- ret = get_write_access(inode);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
- ret = gfs2_rs_alloc(ip);
- if (ret)
- goto out_write_access;
-
gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -486,8 +482,6 @@ out_uninit:
set_page_dirty(page);
wait_for_stable_page(page);
}
-out_write_access:
- put_write_access(inode);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
@@ -623,7 +617,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
if (!(file->f_mode & FMODE_WRITE))
return 0;
- gfs2_rs_delete(ip, &inode->i_writecount);
+ gfs2_rsqa_delete(ip, &inode->i_writecount);
return 0;
}
@@ -703,7 +697,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
return ret;
@@ -897,8 +891,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
i_size_write(inode, pos + count);
- /* Marks the inode as dirty */
file_update_time(file);
+ mark_inode_dirty(inode);
}
return generic_write_sync(file, pos, count);
@@ -938,13 +932,14 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if (ret)
goto out_unlock;
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out_putw;
ret = __gfs2_fallocate(file, mode, offset, len);
if (ret)
- gfs2_rs_deltree(ip->i_res);
+ gfs2_rs_deltree(&ip->i_res);
+
out_putw:
put_write_access(inode);
out_unlock:
@@ -962,7 +957,7 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
int error;
struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return (ssize_t)error;
@@ -1000,7 +995,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
}
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
if (fl->fl_type == F_UNLCK)
- posix_lock_file_wait(file, fl);
+ locks_lock_file_wait(file, fl);
return -EIO;
}
if (IS_GETLK(cmd))
@@ -1018,7 +1013,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_inode *ip = GFS2_I(file_inode(file));
struct gfs2_glock *gl;
unsigned int state;
- int flags;
+ u16 flags;
int error = 0;
int sleeptime;
@@ -1031,8 +1026,11 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (gl) {
if (fl_gh->gh_state == state)
goto out;
- flock_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ locks_lock_file_wait(file,
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
@@ -1056,7 +1054,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (error == GLR_TRYFAILED)
error = -EAGAIN;
} else {
- error = flock_lock_file_wait(file, fl);
+ error = locks_lock_file_wait(file, fl);
gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
}
@@ -1071,7 +1069,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
struct gfs2_holder *fl_gh = &fp->f_fl_gh;
mutex_lock(&fp->f_fl_mutex);
- flock_lock_file_wait(file, fl);
+ locks_lock_file_wait(file, fl);
if (fl_gh->gh_gl) {
gfs2_glock_dq(fl_gh);
gfs2_holder_uninit(fl_gh);
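
The flock_lock_file_wait()/posix_lock_file_wait() call sites above move to the unified locks_lock_file_wait(), which dispatches on fl_flags; that is why the synthetic unlock in do_flock() must now set FL_FLOCK explicitly. A sketch of the dispatch this relies on (assumed to match fs/locks.c of this era; not GFS2 code):

	int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
	{
		if (fl->fl_flags & FL_POSIX)
			return posix_lock_inode_wait(inode, fl);
		if (fl->fl_flags & FL_FLOCK)
			return flock_lock_inode_wait(inode, fl);
		WARN_ON_ONCE(1);	/* caller failed to set fl_flags */
		return -EINVAL;
	}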
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9bd1244caf38..a4ff7b56f5cd 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -246,8 +246,8 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh, *tmp;
@@ -260,10 +260,10 @@ restart:
if (may_grant(gl, gh)) {
if (gh->gh_list.prev == &gl->gl_holders &&
glops->go_lock) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
/* FIXME: eliminate this eventually */
ret = glops->go_lock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (ret) {
if (ret == 1)
return 2;
@@ -361,7 +361,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
unsigned state = ret & LM_OUT_ST_MASK;
int rv;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
state_change(gl, state);
gh = find_first_waiter(gl);
@@ -405,7 +405,7 @@ retry:
pr_err("wanted %u got %u\n", gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
@@ -414,9 +414,9 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl, gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (rv) {
do_error(gl, rv);
goto out;
@@ -429,7 +429,7 @@ retry:
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
out_locked:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -441,12 +441,12 @@ out_locked:
*/
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
@@ -464,7 +464,7 @@ __acquires(&gl->gl_spin)
(gl->gl_state == LM_ST_EXCLUSIVE) ||
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (glops->go_sync)
glops->go_sync(gl);
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -485,7 +485,7 @@ __acquires(&gl->gl_spin)
gfs2_glock_put(gl);
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
/**
@@ -513,8 +513,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
*/
static void run_queue(struct gfs2_glock *gl, const int nonblock)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
int ret;
@@ -596,7 +596,7 @@ static void glock_work_func(struct work_struct *work)
finish_xmote(gl, gl->gl_reply);
drop_ref = 1;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
gl->gl_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != LM_ST_EXCLUSIVE) {
@@ -612,7 +612,7 @@ static void glock_work_func(struct work_struct *work)
}
}
run_queue(gl, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!delay)
gfs2_glock_put(gl);
else {
@@ -750,7 +750,7 @@ again:
*
*/
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh)
{
INIT_LIST_HEAD(&gh->gh_list);
@@ -774,7 +774,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
*
*/
-void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh)
{
gh->gh_state = state;
gh->gh_flags = flags;
@@ -876,8 +876,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -926,10 +926,10 @@ fail:
do_cancel:
gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
return;
@@ -967,7 +967,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
@@ -977,7 +977,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
gl->gl_lockref.count--;
}
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
@@ -1010,7 +1010,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1018,9 +1018,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
glops->go_unlock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
}
if (list_empty(&gl->gl_holders) &&
@@ -1033,7 +1033,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
gfs2_glock_add_to_lru(gl);
trace_gfs2_glock_queue(gh, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (likely(fast_path))
return;
@@ -1080,7 +1080,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
- unsigned int state, int flags, struct gfs2_holder *gh)
+ unsigned int state, u16 flags, struct gfs2_holder *gh)
{
struct gfs2_glock *gl;
int error;
@@ -1217,9 +1217,9 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
delay = gl->gl_hold_time;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
handle_callback(gl, state, delay, true);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1259,7 +1259,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
- * The gl_reply field is under the gl_spin lock so that it is ok
+ * The gl_reply field is protected by gl_lockref.lock so that it is ok
* to use a bitfield shared with other glock state fields.
*/
@@ -1267,20 +1267,20 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
}
gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
@@ -1326,14 +1326,14 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_entry(list->next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
- if (!spin_trylock(&gl->gl_spin)) {
+ if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
atomic_inc(&lru_count);
continue;
}
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
goto add_back_to_lru;
}
clear_bit(GLF_LRU, &gl->gl_flags);
@@ -1343,7 +1343,7 @@ add_back_to_lru:
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gl->gl_lockref.count--;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
}
}
@@ -1417,14 +1417,14 @@ static struct shrinker glock_shrinker = {
static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
- struct rhash_head *pos, *next;
+ struct rhash_head *pos;
const struct bucket_table *tbl;
int i;
rcu_read_lock();
tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
for (i = 0; i < tbl->size; i++) {
- rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+ rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) {
if ((gl->gl_name.ln_sbd == sdp) &&
lockref_get_not_dead(&gl->gl_lockref))
examiner(gl);
@@ -1461,10 +1461,10 @@ static void clear_glock(struct gfs2_glock *gl)
{
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
}
@@ -1482,9 +1482,9 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gfs2_dump_glock(seq, gl);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static void dump_glock_func(struct gfs2_glock *gl)
@@ -1506,7 +1506,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
- wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
+ wait_event_timeout(sdp->sd_glock_wait,
+ atomic_read(&sdp->sd_glock_disposal) == 0,
+ HZ * 600);
glock_hash_walk(dump_glock_func, sdp);
}
@@ -1518,10 +1520,10 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
ret = gfs2_truncatei_resume(ip);
gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static const char *state2str(unsigned state)
@@ -1539,7 +1541,7 @@ static const char *state2str(unsigned state)
return "??";
}
-static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
{
char *p = buf;
if (flags & LM_FLAG_TRY)
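
With the gl_spin alias removed (see the incore.h hunk below), the glock now locks through the spinlock embedded in its lockref, so the reference count and the state fields it protects share one lock. For reference, struct lockref is essentially (sketch of include/linux/lockref.h):

	struct lockref {
		union {
			__u64 lock_count;	/* cmpxchg fast-path target */
			struct {
				spinlock_t lock;
				int count;	/* protected by lock */
			};
		};
	};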
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 32572f71f027..46ab67fc16da 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -79,15 +79,15 @@ enum {
* requested had acquired and released the lock.
*/
-#define LM_FLAG_TRY 0x00000001
-#define LM_FLAG_TRY_1CB 0x00000002
-#define LM_FLAG_NOEXP 0x00000004
-#define LM_FLAG_ANY 0x00000008
-#define LM_FLAG_PRIORITY 0x00000010
-#define GL_ASYNC 0x00000040
-#define GL_EXACT 0x00000080
-#define GL_SKIP 0x00000100
-#define GL_NOCACHE 0x00000400
+#define LM_FLAG_TRY 0x0001
+#define LM_FLAG_TRY_1CB 0x0002
+#define LM_FLAG_NOEXP 0x0004
+#define LM_FLAG_ANY 0x0008
+#define LM_FLAG_PRIORITY 0x0010
+#define GL_ASYNC 0x0040
+#define GL_EXACT 0x0080
+#define GL_SKIP 0x0100
+#define GL_NOCACHE 0x0400
/*
* lm_async_cb return flags
@@ -141,7 +141,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
struct pid *pid;
/* Look in glock's list of holders for one with current task as owner */
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
@@ -151,7 +151,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
}
gh = NULL;
out:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return gh;
}
@@ -183,8 +183,8 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
int create, struct gfs2_glock **glp);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- unsigned flags, struct gfs2_holder *gh);
-extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
+ u16 flags, struct gfs2_holder *gh);
+extern void gfs2_holder_reinit(unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
@@ -195,7 +195,7 @@ extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
- unsigned int state, int flags,
+ unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
@@ -215,7 +215,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
*/
static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
- unsigned int state, int flags,
+ unsigned int state, u16 flags,
struct gfs2_holder *gh)
{
int error;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1f6c9c3fe5cb..437fd73e381e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/posix_acl.h>
+#include <linux/security.h>
#include "gfs2.h"
#include "incore.h"
@@ -146,11 +147,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_rgrpd *rgd;
int error;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_rgrp_brelse(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return;
@@ -162,11 +163,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
mapping_set_error(mapping, error);
gfs2_ail_empty_gl(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_free_clones(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
if (ip) {
set_bit(GIF_INVALID, &ip->i_flags);
forget_all_cached_acls(&ip->i_inode);
+ security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
}
}
@@ -542,7 +544,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
*
- * gl_spin lock is held while calling this
+ * gl_lockref.lock is held while calling this
*/
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 121ed08d9d9f..845fb09cc606 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -259,8 +259,8 @@ struct gfs2_holder {
struct gfs2_glock *gh_gl;
struct pid *gh_owner_pid;
- unsigned int gh_state;
- unsigned gh_flags;
+ u16 gh_flags;
+ u16 gh_state;
int gh_error;
unsigned long gh_iflags; /* HIF_... */
@@ -270,6 +270,13 @@ struct gfs2_holder {
/* Number of quota types we support */
#define GFS2_MAXQUOTAS 2
+struct gfs2_qadata { /* quota allocation data */
+ /* Quota stuff */
+ struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS];
+ struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS];
+ unsigned int qa_qd_num;
+};
+
/* Resource group multi-block reservation, in order of appearance:
Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -288,11 +295,6 @@ struct gfs2_blkreserv {
struct gfs2_rbm rs_rbm; /* Start of reservation */
u32 rs_free; /* how many blocks are still free */
u64 rs_inum; /* Inode number for reservation */
-
- /* ancillary quota stuff */
- struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
- struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
- unsigned int rs_qa_qd_num;
};
/*
@@ -334,9 +336,8 @@ struct gfs2_glock {
struct lm_lockname gl_name;
struct lockref gl_lockref;
-#define gl_spin gl_lockref.lock
- /* State fields protected by gl_spin */
+ /* State fields protected by gl_lockref.lock */
unsigned int gl_state:2, /* Current state */
gl_target:2, /* Target state */
gl_demote_state:2, /* State requested by remote node */
@@ -392,7 +393,8 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
- struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
+ struct gfs2_qadata *i_qadata; /* quota allocation data */
+ struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
@@ -403,6 +405,7 @@ struct gfs2_inode {
u32 i_diskflags;
u8 i_height;
u8 i_depth;
+ u16 i_rahead;
};
/*
@@ -559,6 +562,8 @@ struct gfs2_args {
unsigned int ar_errors:2; /* errors=withdraw | panic */
unsigned int ar_nobarrier:1; /* do not send barriers */
unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */
+ unsigned int ar_loccookie:1; /* use location based readdir
+ cookies */
int ar_commit; /* Commit interval */
int ar_statfs_quantum; /* The fast statfs interval */
int ar_quota_quantum; /* The quota interval */
@@ -686,6 +691,7 @@ struct gfs2_sbd {
u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
u32 sd_max_jheight; /* Max height of journaled file's meta tree */
u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+ u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
struct gfs2_args sd_args; /* Mount arguments */
struct gfs2_tune sd_tune; /* Filesystem tuning structure */
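
Embedding gfs2_blkreserv in the inode, and splitting the quota arrays into the separately allocated gfs2_qadata, means "does this inode hold a reservation" can no longer be a NULL-pointer test. A plausible shape for the updated helper, which is why gfs2_init_inode_once() in main.c below must RB_CLEAR_NODE() the embedded node (sketch, assumed):

	/* Sketch: activity is tracked by rbtree membership, not pointer
	 * nullness, once i_res is embedded in struct gfs2_inode. */
	static inline int gfs2_rs_active(const struct gfs2_blkreserv *rs)
	{
		return !RB_EMPTY_NODE(&rs->rs_node);
	}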
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 063fdfcf8275..3e94400d587c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -191,13 +191,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
fail_iopen:
if (io_gl)
gfs2_glock_put(io_gl);
fail_put:
ip->i_gl->gl_object = NULL;
- gfs2_glock_put(ip->i_gl);
fail:
iget_failed(inode);
return ERR_PTR(error);
@@ -593,7 +593,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- int error, free_vfs_inode = 0;
+ int error, free_vfs_inode = 1;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -601,7 +601,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
- error = gfs2_rs_alloc(dip);
+ error = gfs2_rsqa_alloc(dip);
if (error)
return error;
@@ -650,10 +650,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
- goto fail_free_vfs_inode;
+ goto fail_gunlock;
ip = GFS2_I(inode);
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto fail_free_acls;
@@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
ip->i_entries = 2;
break;
}
+
+ /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */
+ if (dip->i_diskflags & GFS2_DIF_SYSTEM)
+ ip->i_diskflags |= GFS2_DIF_SYSTEM;
+
gfs2_set_inode_flags(inode);
if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) ||
@@ -733,6 +738,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
gfs2_set_iop(inode);
insert_inode_hash(inode);
+ free_vfs_inode = 0; /* After this point, the inode is no longer
+ considered free. Any failures need to undo
+ the gfs2 structures. */
if (default_acl) {
error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
@@ -766,24 +774,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
return error;
fail_gunlock3:
- gfs2_glock_dq_uninit(ghs + 1);
- if (ip->i_gl)
- gfs2_glock_put(ip->i_gl);
- goto fail_gunlock;
-
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_put(io_gl);
fail_gunlock2:
gfs2_glock_dq_uninit(ghs + 1);
fail_free_inode:
if (ip->i_gl)
gfs2_glock_put(ip->i_gl);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rsqa_delete(ip, NULL);
fail_free_acls:
if (default_acl)
posix_acl_release(default_acl);
if (acl)
posix_acl_release(acl);
-fail_free_vfs_inode:
- free_vfs_inode = 1;
fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(ghs);
@@ -898,7 +901,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- error = gfs2_rs_alloc(dip);
+ error = gfs2_rsqa_alloc(dip);
if (error)
return error;
@@ -1371,7 +1374,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
return error;
- error = gfs2_rs_alloc(ndip);
+ error = gfs2_rsqa_alloc(ndip);
if (error)
return error;
@@ -1712,24 +1715,30 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
}
/**
- * gfs2_follow_link - Follow a symbolic link
+ * gfs2_get_link - Follow a symbolic link
* @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
+ * @inode: The inode of the link
+ * @done: destructor for return value
*
* This can handle symlinks of any size.
*
* Returns: The contents of the symlink, or an error pointer
*/
-static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
+static const char *gfs2_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
struct buffer_head *dibh;
unsigned int size;
char *buf;
int error;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
error = gfs2_glock_nq(&i_gh);
if (error) {
@@ -1759,7 +1768,7 @@ static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
out:
gfs2_glock_dq_uninit(&i_gh);
if (!IS_ERR(buf))
- *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
return buf;
}
@@ -1854,11 +1863,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
ogid = ngid = NO_GID_QUOTA_CHANGE;
- error = get_write_access(inode);
- if (error)
- return error;
-
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto out;
@@ -1898,7 +1903,6 @@ out_end_trans:
out_gunlock_q:
gfs2_quota_unlock(ip);
out:
- put_write_access(inode);
return error;
}
@@ -1920,7 +1924,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct gfs2_holder i_gh;
int error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
@@ -2002,7 +2006,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret == 0)
ret = generic_setxattr(dentry, name, data, size, flags);
gfs2_glock_dq(&gh);
@@ -2043,7 +2047,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret == 0)
ret = generic_removexattr(dentry, name);
gfs2_glock_dq(&gh);
@@ -2132,8 +2136,7 @@ const struct inode_operations gfs2_dir_iops = {
const struct inode_operations gfs2_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = gfs2_follow_link,
- .put_link = kfree_put_link,
+ .get_link = gfs2_get_link,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
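
The ->follow_link()/->put_link() pair becomes ->get_link(): the method may now be called in RCU-walk mode, signalled by a NULL dentry and answered with -ECHILD as above, and cleanup moves into a delayed_call destructor that the VFS runs once it is done with the returned string. A minimal in-memory example of the new contract (sketch; kfree_link is assumed to be the generic kfree destructor from fs/libfs.c):

	static const char *example_get_link(struct dentry *dentry,
					    struct inode *inode,
					    struct delayed_call *done)
	{
		char *buf;

		if (!dentry)			/* RCU walk: must not sleep */
			return ERR_PTR(-ECHILD);
		buf = kstrdup("target/path", GFP_KERNEL);
		if (!buf)
			return ERR_PTR(-ENOMEM);
		set_delayed_call(done, kfree_link, buf);  /* freed after use */
		return buf;
	}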
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 284c1542783e..8b907c5cc913 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -50,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
s64 delta = sample - s->stats[index];
s->stats[index] += (delta >> 3);
index++;
- s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+ s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
}
/**
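
The estimator above is the classic TCP srtt/mdev smoothing: the mean moves by delta/8, the mean absolute deviation by (|delta| - dev)/4, and abs64() can become abs() because abs() is type-generic in this tree. A worked instance (sketch):

	/* Integer EWMA step: alpha = 1/8 for the mean, 1/4 for the deviation. */
	s64 mean = 1000, dev = 200, sample = 1800;
	s64 delta = sample - mean;		/* 800 */

	mean += delta >> 3;			/* 1000 + 100   = 1100 */
	dev  += (abs(delta) - dev) >> 2;	/* 200  + 600/4 = 350  */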
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 536e7a6252cd..0ff028c15199 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -716,6 +716,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
}
trace_gfs2_log_flush(sdp, 1);
+ if (type == SHUTDOWN_FLUSH)
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
sdp->sd_log_flush_head = sdp->sd_log_head;
sdp->sd_log_flush_wrapped = 0;
tr = sdp->sd_log_tr;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 241a399bf83d..f99f8e94de3f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,9 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
- ip->i_res = NULL;
+ ip->i_qadata = NULL;
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_hash_cache = NULL;
}
@@ -50,7 +52,7 @@ static void gfs2_init_glock_once(void *foo)
struct gfs2_glock *gl = foo;
INIT_HLIST_BL_NODE(&gl->gl_list);
- spin_lock_init(&gl->gl_spin);
+ spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
@@ -112,7 +114,8 @@ static int __init init_gfs2_fs(void)
gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
sizeof(struct gfs2_inode),
0, SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT,
gfs2_init_inode_once);
if (!gfs2_inode_cachep)
goto fail;
@@ -135,10 +138,10 @@ static int __init init_gfs2_fs(void)
if (!gfs2_quotad_cachep)
goto fail;
- gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk",
- sizeof(struct gfs2_blkreserv),
+ gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata",
+ sizeof(struct gfs2_qadata),
0, 0, NULL);
- if (!gfs2_rsrv_cachep)
+ if (!gfs2_qadata_cachep)
goto fail;
register_shrinker(&gfs2_qd_shrinker);
@@ -193,8 +196,8 @@ fail_lru:
unregister_shrinker(&gfs2_qd_shrinker);
gfs2_glock_exit();
- if (gfs2_rsrv_cachep)
- kmem_cache_destroy(gfs2_rsrv_cachep);
+ if (gfs2_qadata_cachep)
+ kmem_cache_destroy(gfs2_qadata_cachep);
if (gfs2_quotad_cachep)
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -238,7 +241,7 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
mempool_destroy(gfs2_page_pool);
- kmem_cache_destroy(gfs2_rsrv_cachep);
+ kmem_cache_destroy(gfs2_qadata_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0e1d4be5865a..e137d96f1b17 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,6 +187,52 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
return bh;
}
+static void gfs2_meta_read_endio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh = page_buffers(page);
+ unsigned int len = bvec->bv_len;
+
+ while (bh_offset(bh) < bvec->bv_offset)
+ bh = bh->b_this_page;
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ len -= bh->b_size;
+ bh->b_end_io(bh, !bio->bi_error);
+ bh = next;
+ } while (bh && len);
+ }
+ bio_put(bio);
+}
+
+/*
+ * Submit several consecutive buffer head I/O requests as a single bio I/O
+ * request. (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+ struct buffer_head *bh = bhs[0];
+ struct bio *bio;
+ int i;
+
+ if (!num)
+ return;
+
+ bio = bio_alloc(GFP_NOIO, num);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ for (i = 0; i < num; i++) {
+ bh = bhs[i];
+ bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ }
+ bio->bi_end_io = gfs2_meta_read_endio;
+ submit_bio(rw, bio);
+}
+
/**
* gfs2_meta_read - Read a block from disk
* @gl: The glock covering the block
@@ -198,10 +244,11 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
*/
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- struct buffer_head **bhp)
+ int rahead, struct buffer_head **bhp)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct buffer_head *bh;
+ struct buffer_head *bh, *bhs[2];
+ int num = 0;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
*bhp = NULL;
@@ -213,14 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
- return 0;
+ flags &= ~DIO_WAIT;
+ } else {
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ bhs[num++] = bh;
}
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
+
+ if (rahead) {
+ bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ } else {
+ bh->b_end_io = end_buffer_read_sync;
+ bhs[num++] = bh;
+ }
+ }
+
+ gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
+ bh = *bhp;
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh))) {
struct gfs2_trans *tr = current->journal_info;
@@ -341,8 +405,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head *bh;
int ret = 0;
u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+ int rahead = 0;
+
+ if (num == ip->i_no_addr)
+ rahead = ip->i_rahead;
- ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+ ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
brelse(bh);
ret = -EIO;
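
gfs2_meta_read() grows a rahead argument: the dirent's de_rahead hint, stashed into i_rahead by gfs2_dir_search() above, causes block blkno + 1 to be queued in the same bio when the inode's own dinode block is read, so a lookup that immediately needs the following metadata block pays one I/O instead of two. The contract at the two kinds of call site (sketch):

	/* Ordinary metadata: no readahead. */
	error = gfs2_meta_read(gl, blkno, DIO_WAIT, 0, &bh);

	/* Dinode block: pull in the next block too, per the dirent hint. */
	error = gfs2_meta_read(ip->i_gl, ip->i_no_addr, DIO_WAIT,
			       ip->i_rahead, &bh);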
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 8ca161567a93..c5086c8af5ed 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- struct buffer_head **bhp);
+ int rahead, struct buffer_head **bhp);
extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
int create);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 02586e7eb964..dbed9e243ea2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_jheightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+ sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_leaf)) /
+ GFS2_MIN_DIRENT_SIZE;
return 0;
}
@@ -910,8 +913,7 @@ fail_qc_i:
fail_ut_i:
iput(sdp->sd_sc_inode);
fail:
- if (pn)
- iput(pn);
+ iput(pn);
return error;
}
@@ -1291,6 +1293,9 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
+ } else {
+ /* s_mode must be set before deactivate_locked_super() is called */
+ s->s_mode = mode;
}
memset(&args, 0, sizeof(args));
@@ -1312,10 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
if ((flags ^ s->s_flags) & MS_RDONLY)
goto error_super;
} else {
- char b[BDEVNAME_SIZE];
-
- s->s_mode = mode;
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
if (error)
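
The bdevname() call and its BDEVNAME_SIZE stack buffer are replaced by the %pg vsprintf extension, which formats a struct block_device name directly (the hunk above relies on this tree supporting %pg). Usage sketch:

	/* %pg prints the block device name, e.g. "sda1" or "dm-3". */
	snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
	pr_warn("error mounting %pg\n", bdev);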
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3a31226531ea..be6d9c450b22 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd)
error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
if (error)
goto fail;
- error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
if (error)
goto fail;
error = -EIO;
@@ -527,37 +527,70 @@ static void qdsb_put(struct gfs2_quota_data *qd)
qd_put(qd);
}
+/**
+ * gfs2_qa_alloc - make sure we have a quota allocation data structure,
+ * if necessary
+ * @ip: the inode for this reservation
+ */
+int gfs2_qa_alloc(struct gfs2_inode *ip)
+{
+ int error = 0;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ down_write(&ip->i_rw_mutex);
+ if (ip->i_qadata == NULL) {
+ ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+ if (!ip->i_qadata)
+ error = -ENOMEM;
+ }
+ up_write(&ip->i_rw_mutex);
+ return error;
+}
+
+void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+{
+ down_write(&ip->i_rw_mutex);
+ if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+ kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
+ ip->i_qadata = NULL;
+ }
+ up_write(&ip->i_rw_mutex);
+}
+
int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data **qd;
int error;
- if (ip->i_res == NULL) {
- error = gfs2_rs_alloc(ip);
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ if (ip->i_qadata == NULL) {
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
}
- qd = ip->i_res->rs_qa_qd;
+ qd = ip->i_qadata->qa_qd;
- if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) ||
+ if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) ||
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
return -EIO;
- if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
- return 0;
-
error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
@@ -565,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -574,7 +607,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -587,17 +620,17 @@ out:
void gfs2_quota_unhold(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int x;
+ u32 x;
- if (ip->i_res == NULL)
+ if (ip->i_qadata == NULL)
return;
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qdsb_put(ip->i_res->rs_qa_qd[x]);
- ip->i_res->rs_qa_qd[x] = NULL;
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qdsb_put(ip->i_qadata->qa_qd[x]);
+ ip->i_qadata->qa_qd[x] = NULL;
}
- ip->i_res->rs_qa_qd_num = 0;
+ ip->i_qadata->qa_qd_num = 0;
}
static int sort_qd(const void *a, const void *b)
@@ -843,7 +876,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int nalloc = 0, blocks;
int error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
@@ -1003,23 +1036,23 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
- unsigned int x;
+ u32 x;
int error = 0;
- error = gfs2_quota_hold(ip, uid, gid);
- if (error)
- return error;
-
if (capable(CAP_SYS_RESOURCE) ||
sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num,
+ error = gfs2_quota_hold(ip, uid, gid);
+ if (error)
+ return error;
+
+ sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num,
sizeof(struct gfs2_quota_data *), sort_qd, NULL);
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
- error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
+ error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]);
if (error)
break;
}
@@ -1028,7 +1061,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
set_bit(GIF_QD_LOCKED, &ip->i_flags);
else {
while (x--)
- gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
gfs2_quota_unhold(ip);
}
@@ -1076,20 +1109,20 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qda[4];
unsigned int count = 0;
- unsigned int x;
+ u32 x;
int found;
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
goto out;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
int sync;
- qd = ip->i_res->rs_qa_qd[x];
+ qd = ip->i_qadata->qa_qd[x];
sync = need_sync(qd);
- gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
if (!sync)
continue;
@@ -1158,7 +1191,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
s64 value, warn, limit;
- unsigned int x;
+ u32 x;
int error = 0;
ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
@@ -1168,8 +1201,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))))
@@ -1216,15 +1249,17 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
kuid_t uid, kgid_t gid)
{
struct gfs2_quota_data *qd;
- unsigned int x;
+ u32 x;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON ||
+ gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))) {
@@ -1635,7 +1670,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (error)
return error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto out_put;
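
gfs2_qa_alloc()/gfs2_qa_delete() above give the quota data the same lazy lifecycle the reservation used to have: allocated under i_rw_mutex on first need, and freed only when the caller is the last writer if a write count is passed. A sketch of the release-side pairing (hypothetical caller mirroring gfs2_release() in file.c above):

	/* Sketch: free per-inode quota data only for the last writer. */
	static int example_release(struct inode *inode, struct file *file)
	{
		if (file->f_mode & FMODE_WRITE)
			gfs2_rsqa_delete(GFS2_I(inode), &inode->i_writecount);
		return 0;
	}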
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index ad04b3acae2b..5e47c935a515 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -18,6 +18,8 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
+extern int gfs2_qa_alloc(struct gfs2_inode *ip);
+extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 475985d14758..07c0265aa195 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -596,27 +596,13 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_rs_alloc - make sure we have a reservation assigned to the inode
+ * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
+ * plus a quota allocation data structure, if necessary
* @ip: the inode for this reservation
*/
-int gfs2_rs_alloc(struct gfs2_inode *ip)
+int gfs2_rsqa_alloc(struct gfs2_inode *ip)
{
- int error = 0;
-
- down_write(&ip->i_rw_mutex);
- if (ip->i_res)
- goto out;
-
- ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
- if (!ip->i_res) {
- error = -ENOMEM;
- goto out;
- }
-
- RB_CLEAR_NODE(&ip->i_res->rs_node);
-out:
- up_write(&ip->i_rw_mutex);
- return error;
+ return gfs2_qa_alloc(ip);
}
static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -678,21 +664,20 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
}
/**
- * gfs2_rs_delete - delete a multi-block reservation
+ * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
* @ip: The inode for this reservation
* @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
down_write(&ip->i_rw_mutex);
- if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
- gfs2_rs_deltree(ip->i_res);
- BUG_ON(ip->i_res->rs_free);
- kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
- ip->i_res = NULL;
+ if ((wcount == NULL) || (atomic_read(wcount) <= 1)) {
+ gfs2_rs_deltree(&ip->i_res);
+ BUG_ON(ip->i_res.rs_free);
}
up_write(&ip->i_rw_mutex);
+ gfs2_qa_delete(ip, wcount);
}
/**
@@ -729,9 +714,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_object = NULL;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
@@ -933,8 +918,9 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_gl->gl_object = rgd;
- rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
- rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
+ rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK;
+ rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr +
+ rgd->rd_length) * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
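The new rounding widens the glock's address-space range out to whole pages. A worked example with illustrative numbers (512-byte blocks, 4096-byte pages, so PAGE_CACHE_MASK = ~4095):

	/*
	 * rd_addr = 1001, rd_length = 10, bsize = 512:
	 *   old start = 1001 * 512                    = 512512
	 *   new start = 512512 & ~4095                = 512000  (rounded down)
	 *   old end   = (1001 + 10) * 512 - 1         = 517631
	 *   new end   = PAGE_CACHE_ALIGN(517632) - 1  = 520191  (rounded up)
	 */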
@@ -1157,7 +1143,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
for (x = 0; x < length; x++) {
bi = rgd->rd_bits + x;
- error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
+ error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
if (error)
goto fail;
}
@@ -1455,7 +1441,7 @@ static void rs_insert(struct gfs2_inode *ip)
{
struct rb_node **newn, *parent = NULL;
int rc;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
@@ -1502,7 +1488,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
{
struct gfs2_rbm rbm = { .rgd = rgd, };
u64 goal;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
u32 extlen;
u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
int ret;
@@ -1573,7 +1559,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
}
if (n) {
- while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
+ while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
n = n->rb_right;
if (n == NULL)
@@ -1803,7 +1789,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
continue;
*last_unlinked = block;
- error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
+ error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
if (error)
continue;
@@ -1983,7 +1969,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *begin = NULL;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
int error = 0, rg_locked, flags = 0;
u64 last_unlinked = NO_BLOCK;
int loops = 0;
@@ -2112,7 +2098,7 @@ next_rgrp:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
if (rs->rs_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2266,7 +2252,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
static void gfs2_adjust_reservation(struct gfs2_inode *ip,
const struct gfs2_rbm *rbm, unsigned len)
{
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
struct gfs2_rgrpd *rgd = rbm->rgd;
unsigned rlen;
u64 block;
@@ -2309,8 +2295,8 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
{
u64 goal;
- if (gfs2_rs_active(ip->i_res)) {
- *rbm = ip->i_res->rs_rbm;
+ if (gfs2_rs_active(&ip->i_res)) {
+ *rbm = ip->i_res.rs_rbm;
return;
}
@@ -2364,7 +2350,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
gfs2_alloc_extent(&rbm, dinode, nblocks);
block = gfs2_rbm_to_block(&rbm);
rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
- if (gfs2_rs_active(ip->i_res))
+ if (gfs2_rs_active(&ip->i_res))
gfs2_adjust_reservation(ip, &rbm, *nblocks);
ndata = *nblocks;
if (dinode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index c0ab33fa3eed..66b51cf66dfa 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -49,9 +49,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
-extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
@@ -78,7 +78,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
extern int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is in the rgrp tree: */
-static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
+static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
{
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 894fb01a91da..8f960a51a9a0 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
Opt_nobarrier,
Opt_rgrplvb,
Opt_norgrplvb,
+ Opt_loccookie,
+ Opt_noloccookie,
Opt_error,
};
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_rgrplvb, "rgrplvb"},
{Opt_norgrplvb, "norgrplvb"},
+ {Opt_loccookie, "loccookie"},
+ {Opt_noloccookie, "noloccookie"},
{Opt_error, NULL}
};
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
case Opt_norgrplvb:
args->ar_rgrplvb = 0;
break;
+ case Opt_loccookie:
+ args->ar_loccookie = 1;
+ break;
+ case Opt_noloccookie:
+ args->ar_loccookie = 0;
+ break;
case Opt_error:
default:
pr_warn("invalid mount option: %s\n", o);
@@ -556,6 +566,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+ gfs2_trans_add_meta(m_ip->i_gl, m_bh);
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
@@ -564,10 +575,8 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
memset(l_bh->b_data + sizeof(struct gfs2_dinode),
0, sizeof(struct gfs2_statfs_change));
- spin_unlock(&sdp->sd_statfs_spin);
-
- gfs2_trans_add_meta(m_ip->i_gl, m_bh);
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+ spin_unlock(&sdp->sd_statfs_spin);
}
int gfs2_statfs_sync(struct super_block *sb, int type)
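A plausible reading of this reorder (the hunk itself does not spell it out): gfs2_trans_add_meta() may sleep, so both buffers are added to the transaction before the spinlock is taken, and writing the master copy out moves under sd_statfs_spin so the sc_* counters stay serialized with the updates above. The resulting shape, as a sketch:

	gfs2_trans_add_meta(l_ip->i_gl, l_bh);	/* may sleep: do both adds */
	gfs2_trans_add_meta(m_ip->i_gl, m_bh);	/* ... before the spinlock */

	spin_lock(&sdp->sd_statfs_spin);
	/* fold local deltas into the master copy, zero the local copy */
	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
	spin_unlock(&sdp->sd_statfs_spin);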
@@ -842,10 +851,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
- down_write(&sdp->sd_log_flush_lock);
- clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- up_write(&sdp->sd_log_flush_lock);
-
gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
@@ -1419,6 +1424,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",demote_interface_used");
if (args->ar_rgrplvb)
seq_puts(s, ",rgrplvb");
+ if (args->ar_loccookie)
+ seq_puts(s, ",loccookie");
return 0;
}
@@ -1512,6 +1519,7 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
+ struct address_space *metamapping;
int error;
if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
@@ -1526,7 +1534,8 @@ static void gfs2_evict_inode(struct inode *inode)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1575,8 +1584,8 @@ static void gfs2_evict_inode(struct inode *inode)
out_truncate:
gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+ metamapping = gfs2_glock2aspace(ip->i_gl);
if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
- struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
filemap_fdatawrite(metamapping);
filemap_fdatawait(metamapping);
}
@@ -1589,16 +1598,17 @@ out_truncate:
goto out_unlock;
/* Needs to be done before glock release & also in a transaction */
truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(metamapping, 0);
gfs2_trans_end(sdp);
out_unlock:
/* Error path for case 1 */
- if (gfs2_rs_active(ip->i_res))
- gfs2_rs_deltree(ip->i_res);
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_deltree(&ip->i_res);
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
}
gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
@@ -1607,7 +1617,7 @@ out_unlock:
out:
/* Case 3 starts here */
truncate_inode_pages_final(&inode->i_data);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rsqa_delete(ip, NULL);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
@@ -1619,7 +1629,8 @@ out:
if (ip->i_iopen_gh.gh_gl) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
}
}
@@ -1632,7 +1643,9 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
ip->i_flags = 0;
ip->i_gl = NULL;
ip->i_rgd = NULL;
- ip->i_res = NULL;
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
+ ip->i_rahead = 0;
}
return &ip->i_inode;
}
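The initialization above is the visible end of a change that runs through all the rgrp.c hunks: the block reservation moves from a lazily allocated pointer to a structure embedded in the inode. Roughly (sketch; field names are from this patch):

	/*
	 * Before: struct gfs2_blkreserv *i_res;  (kmem_cache_zalloc'd on demand)
	 * After:  struct gfs2_blkreserv  i_res;  (embedded; zeroed and
	 *                                          RB_CLEAR_NODE'd right here)
	 * Hence every ip->i_res in rgrp.c above becomes &ip->i_res, and
	 * gfs2_rsrv_cachep can be repurposed as gfs2_qadata_cachep below.
	 */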
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index b95d0d625f32..0c1bde395062 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -176,6 +176,8 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
unlock_buffer(bh);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ else
+ bd = bh->b_private;
lock_buffer(bh);
gfs2_log_lock(sdp);
}
@@ -236,6 +238,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
lock_page(bh->b_page);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ else
+ bd = bh->b_private;
unlock_page(bh->b_page);
lock_buffer(bh);
gfs2_log_lock(sdp);
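Both trans.c hunks appear to close the same small race: the buffer lock is dropped around the allocation, so another thread may attach a bufdata in the meantime; without the new else branch, bd presumably stayed NULL and the freshly attached bufdata was ignored. The pattern, reduced to its essentials (sketch):

	unlock_buffer(bh);			/* drop the lock to allocate */
	if (bh->b_private == NULL)
		bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
	else
		bd = bh->b_private;		/* lost the race: reuse theirs */
	lock_buffer(bh);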
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 86d2035ac669..cf645835710f 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -27,7 +27,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
struct kmem_cache *gfs2_quotad_cachep __read_mostly;
-struct kmem_cache *gfs2_rsrv_cachep __read_mostly;
+struct kmem_cache *gfs2_qadata_cachep __read_mostly;
mempool_t *gfs2_page_pool __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cbdcbdf39614..c81295f407f6 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -149,7 +149,7 @@ extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
extern struct kmem_cache *gfs2_rgrpd_cachep;
extern struct kmem_cache *gfs2_quotad_cachep;
-extern struct kmem_cache *gfs2_rsrv_cachep;
+extern struct kmem_cache *gfs2_qadata_cachep;
extern mempool_t *gfs2_page_pool;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa9e2a1..e8dfb4740c04 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
__be64 *eablk, *end;
int error;
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
if (error)
return error;
@@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
break;
bn = be64_to_cpu(*eablk);
- error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+ error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
if (error)
break;
error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
return -ENOMEM;
for (x = 0; x < nptrs; x++) {
- error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+ error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
bh + x);
if (error) {
while (x--)
@@ -583,11 +583,13 @@ out:
*
* Returns: actual size of data on success, -errno on error
*/
-static int gfs2_xattr_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_ea_location el;
+ int type = handler->flags;
int error;
if (!ip->i_eattr)
@@ -977,7 +979,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
__be64 *end;
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
&indbh);
if (error)
return error;
@@ -1227,61 +1229,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
return error;
}
-static int gfs2_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int gfs2_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
return __gfs2_xattr_set(d_inode(dentry), name, value,
- size, flags, type);
-}
-
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
- struct gfs2_ea_header *ea, char *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int amount = GFS2_EA_DATA_LEN(ea);
- unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
- int ret;
-
- ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
- if (ret)
- return ret;
-
- ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
- gfs2_trans_end(sdp);
-
- return ret;
-}
-
-int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-{
- struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_ea_location el;
- int error;
-
- error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
- if (error)
- return error;
-
- if (GFS2_EA_IS_STUFFED(el.el_ea)) {
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
- if (error == 0) {
- gfs2_trans_add_meta(ip->i_gl, el.el_bh);
- memcpy(GFS2_EA2DATA(el.el_ea), data,
- GFS2_EA_DATA_LEN(el.el_ea));
- }
- } else {
- error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
- }
-
- brelse(el.el_bh);
- if (error)
- return error;
-
- error = gfs2_setattr_simple(inode, attr);
- gfs2_trans_end(sdp);
- return error;
+ size, flags, handler->flags);
}
static int ea_dealloc_indirect(struct gfs2_inode *ip)
@@ -1303,7 +1256,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
if (error)
return error;
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index d392f8358f2f..2d887c88eb49 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index aa3f0d6d043c..a3ec3ae7d347 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -166,7 +166,7 @@ int hfs_mdb_get(struct super_block *sb)
pr_warn("continuing without an alternate MDB\n");
}
- HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
+ HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
if (!HFS_SB(sb)->bitmap)
goto out;
@@ -360,7 +360,7 @@ void hfs_mdb_put(struct super_block *sb)
unload_nls(HFS_SB(sb)->nls_io);
unload_nls(HFS_SB(sb)->nls_disk);
- free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
+ kfree(HFS_SB(sb)->bitmap);
kfree(HFS_SB(sb));
sb->s_fs_info = NULL;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4574fdd3d421..1ca95c232bb5 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
int err;
hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
- sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
- hfs_init_once);
+ sizeof(struct hfs_inode_info), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
if (!hfs_inode_cachep)
return -ENOMEM;
err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6dd107d7421e..19b33f8151f1 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
} else if (S_ISLNK(inode->i_mode)) {
sbi->file_count++;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
hip->clump_blocks = 1;
} else
@@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_mapping->a_ops = &hfsplus_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
} else {
init_special_inode(inode, inode->i_mode,
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index df0c9af68d05..afb33eda6d7d 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
@@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
err = posix_acl_equiv_mode(acl, &inode->i_mode);
if (err < 0)
@@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96ae8bf..5d54490a136d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
int err;
hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
- HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+ HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
hfsplus_init_once);
if (!hfsplus_inode_cachep)
return -ENOMEM;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 416b1dbafe51..ab01530b4930 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
char *xattr_name;
int res;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
@@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
int res;
char *xattr_name;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
@@ -849,12 +843,10 @@ end_removexattr:
return err;
}
-static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow retrieving properly prefixed attributes
* by prepending them with "osx."
@@ -871,12 +863,10 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
}
-static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow setting properly prefixed attributes
* by prepending them with "osx."
@@ -893,19 +883,8 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
}
-static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_osx_handler = {
.prefix = XATTR_MAC_OSX_PREFIX,
- .list = hfsplus_osx_listxattr,
.get = hfsplus_osx_getxattr,
.set = hfsplus_osx_setxattr,
};
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index aacff00a9ff9..72a68a3a0c99 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -13,32 +13,24 @@
#include "xattr.h"
#include "acl.h"
-static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
static int hfsplus_initxattrs(struct inode *inode,
const struct xattr *xattr_array,
void *fs_info)
@@ -92,7 +84,6 @@ int hfsplus_init_inode_security(struct inode *inode,
const struct xattr_handler hfsplus_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = hfsplus_security_listxattr,
.get = hfsplus_security_getxattr,
.set = hfsplus_security_setxattr,
};
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index bcf65089b7f7..95a7704c7abb 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN);
}
-static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = hfsplus_trusted_listxattr,
.get = hfsplus_trusted_getxattr,
.set = hfsplus_trusted_setxattr,
};
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 5aa0e6dc4a1e..6fc269baf959 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = hfsplus_user_listxattr,
.get = hfsplus_user_getxattr,
.set = hfsplus_user_setxattr,
};
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2ac99db3750e..cfaa18c7a337 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+ hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -730,15 +730,13 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
init_special_inode(inode, mode, dev);
err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
- if (!err)
+ if (err)
goto out_free;
err = read_name(inode, name);
__putname(name);
if (err)
goto out_put;
- if (err)
- goto out_put;
d_instantiate(dentry, inode);
return 0;
@@ -892,9 +890,14 @@ static const struct inode_operations hostfs_dir_iops = {
.setattr = hostfs_setattr,
};
-static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *hostfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- char *link = __getname();
+ char *link;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ link = kmalloc(PATH_MAX, GFP_KERNEL);
if (link) {
char *path = dentry_name(dentry);
int err = -ENOMEM;
@@ -905,25 +908,20 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
__putname(path);
}
if (err < 0) {
- __putname(link);
+ kfree(link);
return ERR_PTR(err);
}
} else {
return ERR_PTR(-ENOMEM);
}
- return *cookie = link;
-}
-
-static void hostfs_put_link(struct inode *unused, void *cookie)
-{
- __putname(cookie);
+ set_delayed_call(done, kfree_link, link);
+ return link;
}
static const struct inode_operations hostfs_link_iops = {
.readlink = generic_readlink,
- .follow_link = hostfs_follow_link,
- .put_link = hostfs_put_link,
+ .get_link = hostfs_get_link,
};
static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
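The conversion follows the new ->get_link contract: a NULL dentry signals an RCU (lockless) path walk where blocking is not allowed, and set_delayed_call() hands ownership of the buffer to the VFS, which frees it once the symlink body is no longer needed. A kernel-side sketch for a hypothetical filesystem ("examplefs" is a placeholder):

	static const char *examplefs_get_link(struct dentry *dentry,
					      struct inode *inode,
					      struct delayed_call *done)
	{
		char *target;

		if (!dentry)			 /* RCU walk: cannot block  */
			return ERR_PTR(-ECHILD); /* VFS retries in ref-walk */

		target = kmalloc(PATH_MAX, GFP_KERNEL);
		if (!target)
			return ERR_PTR(-ENOMEM);
		/* ... fill target with the link body ... */
		set_delayed_call(done, kfree_link, target); /* VFS frees it */
		return target;
	}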
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 933c73780813..1f3c6d76200b 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i)
kfree(ea);
i->i_mode = S_IFLNK | 0777;
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &hpfs_symlink_aops;
set_nlink(i, 1);
i->i_size = ea_size;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index a69bbc1e87f8..a136929189f0 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -133,7 +133,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock)
{
struct quad_buffer_head qbh;
- u32 *directory;
+ __le32 *directory;
u32 n_hotfixes, n_used_hotfixes;
unsigned i;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9e92c9c2d319..506765afa1a3 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -227,8 +227,6 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
int err;
if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
- if (!new_valid_dev(rdev))
- return -EINVAL;
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -334,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
result->i_blocks = 1;
set_nlink(result, 1);
result->i_size = strlen(symlink);
+ inode_nohighmem(result);
result->i_op = &page_symlink_inode_operations;
result->i_data.a_ops = &hpfs_symlink_aops;
@@ -502,7 +501,7 @@ out:
static int hpfs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct inode *i = page->mapping->host;
struct fnode *fnode;
struct buffer_head *bh;
@@ -518,14 +517,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
goto fail;
hpfs_unlock(i->i_sb);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
hpfs_unlock(i->i_sb);
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a561591896bd..458cf463047b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
sizeof(struct hpfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (hpfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316adb968b65..8bbf7f3e2a27 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,11 +4,11 @@
* Nadia Yvette Chambers, 2002
*
* Copyright (C) 2002 Linus Torvalds.
+ * License: GPL
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
@@ -324,20 +324,62 @@ static void remove_huge_page(struct page *page)
delete_from_page_cache(page);
}
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+ struct vm_area_struct *vma;
+
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+ unsigned long v_offset;
+ unsigned long v_end;
+
+ /*
+ * Can the expression below overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ v_offset = 0;
+
+ if (!end)
+ v_end = vma->vm_end;
+ else {
+ v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+ + vma->vm_start;
+ if (v_end > vma->vm_end)
+ v_end = vma->vm_end;
+ }
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+ NULL);
+ }
+}
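A quick worked example of the offset math above (illustrative numbers only):

	/*
	 * PAGE_SHIFT = 12, vma->vm_pgoff = 100, start = 612, end = 0:
	 *   v_offset = (612 - 100) << 12 = 512 pages = 2 MiB into the vma,
	 *   so unmapping begins at vma->vm_start + 2 MiB.
	 *   With end == 0 (truncate), v_end = vma->vm_end: unmap to the end.
	 * A vma whose vm_pgoff >= start gets v_offset = 0 and is unmapped
	 * from vma->vm_start.
	 */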
/*
* remove_inode_hugepages handles two distinct cases: truncation and hole
* punch. There are subtle differences in operation for each case.
-
+ *
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts.
+ * maps and global counts. Page faults can not race with truncation
+ * in this routine. hugetlb_no_page() prevents page faults in the
+ * truncated range. It checks i_size before allocation, and again after
+ * with the page table lock for the page held. The same lock must be
+ * acquired to unmap a page.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
* deleted. The region/reserv map for ranges without associated
- * pages are not modified.
+ * pages are not modified. Page faults can race with hole punch.
+ * This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
*/
@@ -361,77 +403,81 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
next = start;
while (next < end) {
/*
- * Make sure to never grab more pages that we
- * might possibly need.
+ * Don't grab more pages than the number left in the range.
*/
if (end - next < lookup_nr)
lookup_nr = end - next;
/*
- * This pagevec_lookup() may return pages past 'end',
- * so we must check for page->index > end.
+ * When no more pages are found, we are done.
*/
- if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
- if (next == start)
- break;
- next = start;
- continue;
- }
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+ break;
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ bool rsv_on_error;
u32 hash;
+ /*
+ * The page (index) could be beyond end. This is
+ * only possible in the punch hole case as end is
+ * max page offset in the truncate case.
+ */
+ next = page->index;
+ if (next >= end)
+ break;
+
hash = hugetlb_fault_mutex_hash(h, current->mm,
&pseudo_vma,
mapping, next, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
- lock_page(page);
- if (page->index >= end) {
- unlock_page(page);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- next = end; /* we are done */
- break;
- }
-
/*
* If page is mapped, it was faulted in after being
- * unmapped. Do nothing in this race case. In the
- * normal case page is not mapped.
+ * unmapped in the caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ *
+ * This race can only happen in the hole punch case.
+ * Getting here in a truncate operation is a bug.
*/
- if (!page_mapped(page)) {
- bool rsv_on_error = !PagePrivate(page);
- /*
- * We must free the huge page and remove
- * from page cache (remove_huge_page) BEFORE
- * removing the region/reserve map
- * (hugetlb_unreserve_pages). In rare out
- * of memory conditions, removal of the
- * region/reserve map could fail. Before
- * free'ing the page, note PagePrivate which
- * is used in case of error.
- */
- remove_huge_page(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(
- inode, next,
- next + 1, 1)))
- hugetlb_fix_reserve_counts(
- inode, rsv_on_error);
- }
+ if (unlikely(page_mapped(page))) {
+ BUG_ON(truncate_op);
+
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ next * pages_per_huge_page(h),
+ (next + 1) * pages_per_huge_page(h));
+ i_mmap_unlock_write(mapping);
}
- if (page->index > next)
- next = page->index;
+ lock_page(page);
+ /*
+ * We must free the huge page and remove from page
+ * cache (remove_huge_page) BEFORE removing the
+ * region/reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the
+ * region/reserve map could fail. Before free'ing
+ * the page, note PagePrivate which is used in case
+ * of error.
+ */
+ rsv_on_error = !PagePrivate(page);
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode,
+ next, next + 1, 1)))
+ hugetlb_fix_reserve_counts(inode,
+ rsv_on_error);
+ }
- ++next;
unlock_page(page);
-
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
+ ++next;
huge_pagevec_release(&pvec);
+ cond_resched();
}
if (truncate_op)
@@ -450,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
- struct vm_area_struct *vma;
-
- /*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
- */
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
- unsigned long v_offset;
-
- /*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
- */
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (end) {
- end = ((end - start) << PAGE_SHIFT) +
- vma->vm_start + v_offset;
- if (end > vma->vm_end)
- end = vma->vm_end;
- } else
- end = vma->vm_end;
-
- unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
- }
-}
-
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
@@ -647,9 +658,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
inode->i_ctime = CURRENT_TIME;
- spin_lock(&inode->i_lock);
- inode->i_private = NULL;
- spin_unlock(&inode->i_lock);
out:
mutex_unlock(&inode->i_mutex);
return error;
@@ -709,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
/*
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
* be taken from reclaim -- unlike regular filesystems. This needs an
- * annotation because huge_pmd_share() does an allocation under
+ * annotation because huge_pmd_share() does an allocation under hugetlb's
* i_mmap_rwsem.
*/
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
@@ -739,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* The policy is initialized here even if we are creating a
* private inode because initialization simply creates an
- * an empty rb tree and calls spin_lock_init(), later when we
+ * empty rb tree and calls rwlock_init(), later when we
* call mpol_free_shared_policy() it will just return because
* the rb tree will still be empty.
*/
@@ -761,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
lockdep_annotate_inode_mutex_key(inode);
@@ -1202,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = {
.mount = hugetlbfs_mount,
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("hugetlbfs");
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1322,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void)
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
- 0, 0, init_once);
+ 0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
goto out2;
@@ -1356,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void)
out2:
return error;
}
-
-static void __exit exit_hugetlbfs_fs(void)
-{
- struct hstate *h;
- int i;
-
-
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(hugetlbfs_inode_cachep);
- i = 0;
- for_each_hstate(h)
- kern_unmount(hugetlbfs_vfsmount[i++]);
- unregister_filesystem(&hugetlbfs_fs_type);
-}
-
-module_init(init_hugetlbfs_fs)
-module_exit(exit_hugetlbfs_fs)
-
-MODULE_LICENSE("GPL");
+fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index 78a17b8859e1..e491e54d2430 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -225,7 +225,7 @@ void __destroy_inode(struct inode *inode)
inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
- locks_free_lock_context(inode->i_flctx);
+ locks_free_lock_context(inode);
if (!inode->i_nlink) {
WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -1597,6 +1597,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
/**
* touch_atime - update the access time
* @path: the &struct path to update
+ * @inode: inode to update
*
* Update the accessed time on an inode and mark it for writeback.
* This function automatically handles read only file systems and media,
@@ -1882,7 +1883,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
@@ -2027,3 +2028,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags,
new_flags) != old_flags));
}
EXPORT_SYMBOL(inode_set_flags);
+
+void inode_nohighmem(struct inode *inode)
+{
+ mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
+}
+EXPORT_SYMBOL(inode_nohighmem);
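This tiny helper explains the inode_nohighmem() calls sprinkled through this patch: GFP_USER has no __GFP_HIGHMEM bit, so symlink pages are always directly addressable, which is why the symlink readpage paths above could drop their kmap()/kunmap() pairs in favor of page_address(). The typical caller pattern, as seen in the hfsplus/hpfs/isofs hunks (foo_symlink_aops is a placeholder):

	inode->i_op = &page_symlink_inode_operations;
	inode_nohighmem(inode);		/* symlink pages never in highmem */
	inode->i_mapping->a_ops = &foo_symlink_aops;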
diff --git a/fs/internal.h b/fs/internal.h
index 71859c4d0b41..b71deeecea17 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
/*
* namespace.c
*/
-extern int copy_mount_options(const void __user *, unsigned long *);
+extern void *copy_mount_options(const void __user *);
extern char *copy_mount_string(const void __user *);
extern struct vfsmount *lookup_mnt(struct path *);
@@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m);
* fs/nsfs.c
*/
extern struct dentry_operations ns_dentry_operations;
+
+/*
+ * fs/ioctl.c
+ */
+extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
+ unsigned long arg);
+extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5d01d2638ca5..29466c380958 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
+#include "internal.h"
#include <asm/ioctls.h>
@@ -32,8 +33,7 @@
*
* Returns 0 on success, -errno on error.
*/
-static long vfs_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
+long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -215,6 +215,29 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
return error;
}
+static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct fd src_file = fdget(srcfd);
+ int ret;
+
+ if (!src_file.file)
+ return -EBADF;
+ ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+ fdput(src_file);
+ return ret;
+}
+
+static long ioctl_file_clone_range(struct file *file, void __user *argp)
+{
+ struct file_clone_range args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ return ioctl_file_clone(file, args.src_fd, args.src_offset,
+ args.src_length, args.dest_offset);
+}
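For reference, the new ioctls are driven from user space roughly like this (sketch; error handling trimmed, struct layout per the fields this code reads):

	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FICLONE, FICLONERANGE, file_clone_range */

	/* Reflink the first 64 KiB of src_fd into dst_fd at offset 0. */
	static int clone_first_64k(int src_fd, int dst_fd)
	{
		struct file_clone_range args = {
			.src_fd      = src_fd,
			.src_offset  = 0,
			.src_length  = 65536,	/* 0 means "to source EOF" */
			.dest_offset = 0,
		};

		/* Whole-file variant: ioctl(dst_fd, FICLONE, src_fd);
		 * which the kernel maps to ioctl_file_clone(filp, arg,
		 * 0, 0, 0) above. */
		return ioctl(dst_fd, FICLONERANGE, &args);
	}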
+
#ifdef CONFIG_BLOCK
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -545,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
+static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+{
+ struct file_dedupe_range __user *argp = arg;
+ struct file_dedupe_range *same = NULL;
+ int ret;
+ unsigned long size;
+ u16 count;
+
+ if (get_user(count, &argp->dest_count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ size = offsetof(struct file_dedupe_range __user, info[count]);
+
+ same = memdup_user(argp, size);
+ if (IS_ERR(same)) {
+ ret = PTR_ERR(same);
+ same = NULL;
+ goto out;
+ }
+
+ ret = vfs_dedupe_file_range(file, same);
+ if (ret)
+ goto out;
+
+ ret = copy_to_user(argp, same, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(same);
+ return ret;
+}
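FIDEDUPERANGE is batch-oriented: the caller passes a source range plus a variable-length array of destinations, sized by dest_count exactly as the get_user() above reads it. A user-space sketch with a single destination (result fields per the uapi header this series adds):

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FIDEDUPERANGE, file_dedupe_range */

	/* Dedupe the first 64 KiB of src_fd against one destination file. */
	static int dedupe_one(int src_fd, int dest_fd)
	{
		struct file_dedupe_range *range;
		int ret;

		range = calloc(1, sizeof(*range) + sizeof(range->info[0]));
		if (!range)
			return -1;
		range->src_offset = 0;
		range->src_length = 65536;
		range->dest_count = 1;	/* matches the get_user() above */
		range->info[0].dest_fd = dest_fd;
		range->info[0].dest_offset = 0;

		ret = ioctl(src_fd, FIDEDUPERANGE, range);
		/* On success, info[0].status and info[0].bytes_deduped
		 * report what the filesystem actually deduplicated. */
		free(range);
		return ret;
	}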
+
/*
* When you add any new common ioctls to the switches above and below
* please update compat_sys_ioctl() too.
@@ -600,6 +658,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
case FIGETBSZ:
return put_user(inode->i_sb->s_blocksize, argp);
+ case FICLONE:
+ return ioctl_file_clone(filp, arg, 0, 0, 0);
+
+ case FICLONERANGE:
+ return ioctl_file_clone_range(filp, argp);
+
+ case FIDEDUPERANGE:
+ return ioctl_file_dedupe_range(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d67a16f2a45d..bcd2d41b318a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (isofs_inode_cachep == NULL)
return -ENOMEM;
@@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_fop = &isofs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
} else
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 735d7522a3a9..5384ceb35b1c 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- char *link = kmap(page);
+ char *link = page_address(page);
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
struct buffer_head *bh;
char *rpnt = link;
@@ -774,7 +774,6 @@ repeat:
brelse(bh);
*rpnt = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
@@ -791,7 +790,6 @@ fail:
brelse(bh);
error:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 8c44654ce274..684996c8a3a4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -427,7 +427,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret;
- int freed = 0;
if (!jh)
return 0;
@@ -441,10 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
else
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
if (!ret)
- return freed;
+ return 0;
if (ret == 2)
return 1;
- freed = 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
@@ -452,10 +450,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
* requested:
*/
if (need_resched())
- return freed;
+ return 0;
} while (jh != last_jh);
- return freed;
+ return 0;
}
/*
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 362e5f614450..36345fefa3ff 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
@@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ !jbd2_has_feature_async_commit(journal))
ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
else
ret = submit_bh(WRITE_SYNC, bh);
@@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(j))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
@@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
else
tag->t_checksum = cpu_to_be16(csum32);
@@ -730,8 +728,7 @@ start_journal_io:
/*
* Compute checksum.
*/
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
crc32_sum =
jbd2_checksum_data(crc32_sum, bh);
}
@@ -797,8 +794,7 @@ start_journal_io:
blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
/* Done it all: now write the commit record asynchronously. */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -889,8 +885,7 @@ start_journal_io:
commit_transaction->t_state = T_COMMIT_JFLUSH;
write_unlock(&journal->j_state_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -898,8 +893,7 @@ start_journal_io:
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+ if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8270fe9e3641..81e622681c82 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
- if (!jbd2_journal_has_csum_v2or3(j))
+ if (!jbd2_journal_has_csum_v2or3_feature(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -1523,16 +1523,16 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
- JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
/* Can't have checksum v2 and v3 at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
"at the same time!\n");
goto out;
}
- if (jbd2_journal_has_csum_v2or3(journal) &&
- JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
"at the same time!\n");
@@ -1545,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3(journal)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1558,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal)
/* Check superblock checksum */
if (!jbd2_superblock_csum_verify(journal, sb)) {
printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
goto out;
}
@@ -1649,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal)
printk(KERN_ERR "JBD2: journal transaction %u on %s "
"is corrupt.\n", journal->j_failed_commit,
journal->j_devname);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* OK, we've finished with the dynamic journal bits:
@@ -2071,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__jbd2_journal_abort_hard(journal);
- if (errno)
+ if (errno) {
jbd2_journal_update_sb_errno(journal);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_REC_ERR;
+ write_unlock(&journal->j_state_lock);
+ }
}
/**
@@ -2197,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal)
{
size_t sz;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(journal))
return sizeof(journal_block_tag3_t);
sz = sizeof(journal_block_tag_t);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_has_feature_csum2(journal))
sz += sizeof(__u16);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
return sz;
else
return sz - sizeof(__u32);
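With the feature predicates spelled out, the per-tag size is easy to tabulate by hand (assuming the usual jbd2 tag layouts: 12-byte journal_block_tag_t, 16-byte journal_block_tag3_t):

	/*
	 * Per tag written to the log:
	 *   csum v3                 : 16
	 *   csum v2, 64-bit journal : 12 + 2      = 14
	 *   csum v2, 32-bit journal : 12 + 2 - 4  = 10
	 *   no v2/v3, 64-bit        : 12
	 *   no v2/v3, 32-bit        : 12 - 4      = 8
	 * (the final -4 drops t_blocknr_high on 32-bit journals)
	 */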
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a9079d035ae5..7f277e49fe88 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = jbd2_journal_bmap(journal, offset, &blocknr);
@@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal,
journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
@@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
else
return tag->t_checksum == cpu_to_be16(csum32);
@@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal,
printk(KERN_ERR "JBD2: Invalid checksum "
"recovering block %lu in log\n",
next_log_block);
- err = -EIO;
+ err = -EFSBADCRC;
brelse(bh);
goto failed;
}
@@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal,
* just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM) &&
+ jbd2_has_feature_checksum(journal) &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
@@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal,
journal, tag, obh->b_data,
be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
- success = -EIO;
+ success = -EFSBADCRC;
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"block %llu in log\n",
@@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number. */
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ jbd2_has_feature_checksum(journal)) {
int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
@@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal,
if (chksum_err) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal,
bh->b_data)) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
rcount = be32_to_cpu(header->r_count);
if (!jbd2_revoke_block_csum_verify(journal, header))
- return -EINVAL;
+ return -EFSBADCRC;
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
@@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
return -EINVAL;
max = rcount;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
record_len = 8;
while (offset + record_len <= max) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 0abf2e7f725b..705ae577882b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -589,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal,
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
sz = 8;
else
sz = 4;
@@ -619,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal,
*descriptorp = descriptor;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
else
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6b8338ec2464..081dff087fc0 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -764,13 +764,11 @@ void jbd2_journal_unlock_updates (journal_t *journal)
static void warn_dirty_buffer(struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
-
printk(KERN_WARNING
- "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+ "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
"There's a risk of filesystem corruption in case of system "
"crash.\n",
- bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
+ bh->b_bdev, (unsigned long long)bh->b_blocknr);
}
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
@@ -1009,7 +1007,8 @@ out:
}
/* Fast check whether buffer is already attached to the required transaction */
-static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
+ bool undo)
{
struct journal_head *jh;
bool ret = false;
@@ -1036,6 +1035,9 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
jh = READ_ONCE(bh->b_private);
if (!jh)
goto out;
+ /* For undo access buffer must have data copied */
+ if (undo && !jh->b_committed_data)
+ goto out;
if (jh->b_transaction != handle->h_transaction &&
jh->b_next_transaction != handle->h_transaction)
goto out;
@@ -1073,7 +1075,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int rc;
- if (jbd2_write_access_granted(handle, bh))
+ if (jbd2_write_access_granted(handle, bh, false))
return 0;
jh = jbd2_journal_add_journal_head(bh);
@@ -1210,7 +1212,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
char *committed_data = NULL;
JBUFFER_TRACE(jh, "entry");
- if (jbd2_write_access_granted(handle, bh))
+ if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
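
jbd2_write_access_granted() gains an `undo` flag because the lockless fast path must not bypass the guarantee that undo access provides: a committed-data copy. A hedged sketch of the caller-side contract being preserved:

	/* After a successful get_undo_access(), callers (e.g. ext4 bitmap
	 * undo) expect the pre-modification copy to exist. */
	err = jbd2_journal_get_undo_access(handle, bh);
	if (err)
		return err;
	jh = bh2jh(bh);
	WARN_ON(jh->b_committed_data == NULL);	/* fast path may no longer skip this */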
@@ -1937,8 +1939,8 @@ out:
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS are set, we wait for commit
+ * code to release the buffers.
*
*
* For all the buffers on this page,
@@ -2152,6 +2154,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
if (!buffer_dirty(bh)) {
/* bdflush has written it. We can drop it now */
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
@@ -2181,6 +2184,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
clear_buffer_jbddirty(bh);
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
}
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index bb9cebc9ca8a..e5c1783ab64a 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c)
siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
- allow_signal(SIGCONT);
allow_signal(SIGHUP);
c->gc_task = current;
@@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c)
/* Put_super will send a SIGKILL and then wait on the sem.
*/
while (signal_pending(current) || freezing(current)) {
- siginfo_t info;
unsigned long signr;
if (try_to_freeze())
goto again;
- signr = dequeue_signal_lock(current, &current->blocked, &info);
+ signr = kernel_dequeue_signal(NULL);
switch(signr) {
case SIGSTOP:
jffs2_dbg(1, "%s(): SIGSTOP received\n",
__func__);
- set_current_state(TASK_STOPPED);
- schedule();
+ kernel_signal_stop();
break;
case SIGKILL:
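
The garbage-collector thread drops its open-coded signal plumbing in favor of the kernel_dequeue_signal()/kernel_signal_stop() helpers (and stops catching SIGCONT, which it never needed). Roughly what the helpers do, per include/linux/signal.h (sketch):

	static inline int kernel_dequeue_signal(siginfo_t *info)
	{
		struct task_struct *tsk = current;
		siginfo_t _info;
		int ret;

		spin_lock_irq(&tsk->sighand->siglock);
		ret = dequeue_signal(tsk, &tsk->blocked, info ?: &_info);
		spin_unlock_irq(&tsk->sighand->siglock);
		return ret;
	}

	static inline void kernel_signal_stop(void)
	{
		spin_lock_irq(&current->sighand->siglock);
		if (current->jobctl & JOBCTL_STOP_DEQUEUED)
			__set_current_state(TASK_STOPPED);
		spin_unlock_irq(&current->sighand->siglock);
		schedule();
	}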
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 81180022923f..d211b8e18566 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
uint32_t alloclen;
int ret;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
ri = jffs2_alloc_raw_inode();
if (!ri)
return -ENOMEM;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index b8fd651307a4..ce1189793288 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void)
void jffs2_destroy_slab_caches(void)
{
- if(full_dnode_slab)
- kmem_cache_destroy(full_dnode_slab);
- if(raw_dirent_slab)
- kmem_cache_destroy(raw_dirent_slab);
- if(raw_inode_slab)
- kmem_cache_destroy(raw_inode_slab);
- if(tmp_dnode_info_slab)
- kmem_cache_destroy(tmp_dnode_info_slab);
- if(raw_node_ref_slab)
- kmem_cache_destroy(raw_node_ref_slab);
- if(node_frag_slab)
- kmem_cache_destroy(node_frag_slab);
- if(inode_cache_slab)
- kmem_cache_destroy(inode_cache_slab);
+ kmem_cache_destroy(full_dnode_slab);
+ kmem_cache_destroy(raw_dirent_slab);
+ kmem_cache_destroy(raw_inode_slab);
+ kmem_cache_destroy(tmp_dnode_info_slab);
+ kmem_cache_destroy(raw_node_ref_slab);
+ kmem_cache_destroy(node_frag_slab);
+ kmem_cache_destroy(inode_cache_slab);
#ifdef CONFIG_JFFS2_FS_XATTR
- if (xattr_datum_cache)
- kmem_cache_destroy(xattr_datum_cache);
- if (xattr_ref_cache)
- kmem_cache_destroy(xattr_ref_cache);
+ kmem_cache_destroy(xattr_datum_cache);
+ kmem_cache_destroy(xattr_ref_cache);
#endif
}
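
The if-before-destroy guards are dead weight: kmem_cache_destroy() accepts NULL the same way kfree() does. Sketch of the guard assumed here (mm/slab_common.c holds the authoritative version):

	void kmem_cache_destroy(struct kmem_cache *s)
	{
		if (unlikely(!s))
			return;	/* NULL is a no-op, so callers need no check */
		/* ... normal cache teardown ... */
	}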
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 28e0aab42bc3..bfebbf13698c 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
err = jffs2_flash_read(c, (ref_offset(ref)) + read,
rd->nsize - already, &read, &fd->name[already]);
- if (unlikely(read != rd->nsize - already) && likely(!err))
+ if (unlikely(read != rd->nsize - already) && likely(!err)) {
+ jffs2_free_full_dirent(fd);
+ JFFS2_ERROR("short read: wanted %d bytes, got %zd\n",
+ rd->nsize - already, read);
return -EIO;
+ }
if (unlikely(err)) {
JFFS2_ERROR("read remainder of name: error %d\n", err);
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index d4b43fb7adb1..7a28facd7175 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -48,42 +48,24 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
}
/* ---- XATTR Handler for "security.*" ----------------- */
-static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size);
}
-static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
-static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_SECURITY_PREFIX);
- strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = jffs2_security_listxattr,
.set = jffs2_security_setxattr,
.get = jffs2_security_getxattr
};
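
The xattr handler methods gain a `handler` argument and lose `type`; the VFS resolver now matches and strips the "security." prefix before calling ->get/->set (an empty remainder is rejected at the VFS level), so both the strcmp(name, "") guards and the per-handler ->list name-building callback disappear. A hedged sketch of the dispatch this relies on (generic_getxattr()-style, details abbreviated):

	const struct xattr_handler *handler;

	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
	if (!handler)
		return -EOPNOTSUPP;
	/* "name" now points past the matched prefix, never at "" */
	return handler->get(handler, dentry, name, buffer, size);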
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e3176a1..bb080c272149 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void)
jffs2_inode_cachep = kmem_cache_create("jffs2_i",
sizeof(struct jffs2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 8ce2f240125b..2cabd649d4fb 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -14,7 +14,7 @@
const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 09ed55190ee2..5a3da3f52908 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
{
struct delayed_work *dwork;
- dwork = container_of(work, struct delayed_work, work);
+ dwork = to_delayed_work(work);
return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
}
@@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
if ((c->flash_size % c->sector_size) != 0) {
c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
- };
+ }
c->wbuf_ofs = 0xFFFFFFFF;
c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
@@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
if (!c->wbuf_verify) {
- kfree(c->oobbuf);
kfree(c->wbuf);
return -ENOMEM;
}
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index f092fee5be50..da3e18503c65 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct jffs2_xattr_ref *ref, **pref;
struct jffs2_xattr_datum *xd;
const struct xattr_handler *xhandle;
- ssize_t len, rc;
+ const char *prefix;
+ ssize_t prefix_len, len, rc;
int retry = 0;
rc = check_xattr_ref_inode(c, ic);
@@ -998,17 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
}
xhandle = xprefix_to_handler(xd->xprefix);
- if (!xhandle)
+ if (!xhandle || (xhandle->list && !xhandle->list(dentry)))
continue;
+ prefix = xhandle->prefix ?: xhandle->name;
+ prefix_len = strlen(prefix);
+ rc = prefix_len + xd->name_len + 1;
+
if (buffer) {
- rc = xhandle->list(dentry, buffer+len, size-len,
- xd->xname, xd->name_len, xd->flags);
- } else {
- rc = xhandle->list(dentry, NULL, 0, xd->xname,
- xd->name_len, xd->flags);
+ if (rc > size - len) {
+ rc = -ERANGE;
+ goto out;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, xd->xname, xd->name_len);
+ buffer += xd->name_len;
+ *buffer++ = 0;
}
- if (rc < 0)
- goto out;
len += rc;
}
rc = len;
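
With the ->list callbacks gone, jffs2_listxattr() assembles each entry itself: handler visibility becomes a boolean ->list check (see xattr_trusted.c below), and every emitted entry is prefix + name + NUL, failing with -ERANGE when the caller's buffer is too small. The per-entry arithmetic, sketched:

	/* One listxattr entry is "<prefix><name>\0" */
	prefix_len = strlen(prefix);
	entry_len  = prefix_len + name_len + 1;
	if (buffer) {
		if (entry_len > size - len)
			return -ERANGE;	/* caller's buffer too small */
		memcpy(buffer + len, prefix, prefix_len);
		memcpy(buffer + len + prefix_len, name, name_len);
		buffer[len + prefix_len + name_len] = '\0';
	}
	len += entry_len;	/* with a NULL buffer this just sizes the result */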
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ceaf9c693225..b2555ef07a12 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,35 +16,25 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size);
}
-static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
-static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static bool jffs2_trusted_listxattr(struct dentry *dentry)
{
- size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
-
- if (list && retlen<=list_size) {
- strcpy(list, XATTR_TRUSTED_PREFIX);
- strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
- }
-
- return retlen;
+ return capable(CAP_SYS_ADMIN);
}
const struct xattr_handler jffs2_trusted_xattr_handler = {
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index a71391eba514..539bd630b5e4 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,40 +16,24 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size);
}
-static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
-static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_USER_PREFIX);
- strcpy(list + XATTR_USER_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = jffs2_user_listxattr,
.set = jffs2_user_setxattr,
.get = jffs2_user_getxattr
};
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 0c8ca830b113..49456853e9de 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
@@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
rc = posix_acl_equiv_mode(acl, &inode->i_mode);
if (rc < 0)
@@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
}
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 41aa3ca6a6a4..9d9bae63ae2a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
} else if (S_ISLNK(inode->i_mode)) {
if (inode->i_size >= IDATASIZE) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &jfs_aops;
} else {
inode->i_op = &jfs_fast_symlink_inode_operations;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a69bdf2a1085..a270cb7ff4e0 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1835,17 +1835,16 @@ static int lbmLogInit(struct jfs_log * log)
for (i = 0; i < LOGPAGES;) {
char *buffer;
uint offset;
- struct page *page;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (buffer == NULL)
+ if (!page)
goto error;
- page = virt_to_page(buffer);
+ buffer = page_address(page);
for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
if (lbuf == NULL) {
if (offset == 0)
- free_page((unsigned long) buffer);
+ __free_page(page);
goto error;
}
if (offset) /* we already have one reference */
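
Starting from alloc_page() rather than get_zeroed_page() is behavior-neutral for lowmem; it just makes the struct page the primary handle, which suits the per-page reference counting in this loop. The equivalence assumed:

	/* Two spellings of a zeroed lowmem page allocation */
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	char *buffer = page_address(page);		/* page -> kernel address */

	char *buf2 = (char *)get_zeroed_page(GFP_KERNEL);
	struct page *page2 = virt_to_page(buf2);	/* kernel address -> page */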
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 35976bdccafc..701f89370de7 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -983,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
ip->i_op = &jfs_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &jfs_aops;
/*
@@ -1372,9 +1373,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
tid_t tid;
struct tblock *tblk;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
jfs_info("jfs_mknod: %pd", dentry);
rc = dquot_initialize(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4cd9798f4948..900925b5eb8c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -496,9 +496,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
- if (!new_valid_dev(sb->s_bdev->bd_dev))
- return -EOVERFLOW;
-
sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
@@ -901,7 +898,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (jfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 5929e2363cb8..f8db4fde0b0b 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -23,7 +23,7 @@
const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
@@ -33,8 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = {
const struct inode_operations jfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 91e004518237..821973853340 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- /*
- * If the ino of the sysfs entry created for a kmem cache gets
- * allocated from an ida layer, which is accounted to the memcg that
- * owns the cache, the memcg will get pinned forever. So do not account
- * ino ida allocations.
- */
- ret = ida_simple_get(&root->ino_ida, 1, 0,
- GFP_KERNEL | __GFP_NOACCOUNT);
+ ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
if (ret < 0)
goto err_out2;
kn->ino = ret;
@@ -694,6 +687,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
return NULL;
}
+static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
+ const unsigned char *path,
+ const void *ns)
+{
+ static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */
+ size_t len = strlcpy(path_buf, path, PATH_MAX);
+ char *p = path_buf;
+ char *name;
+
+ lockdep_assert_held(&kernfs_mutex);
+
+ if (len >= PATH_MAX)
+ return NULL;
+
+ while ((name = strsep(&p, "/")) && parent) {
+ if (*name == '\0')
+ continue;
+ parent = kernfs_find_ns(parent, name, ns);
+ }
+
+ return parent;
+}
+
/**
* kernfs_find_and_get_ns - find and get kernfs_node with the given name
* @parent: kernfs_node to search under
@@ -719,6 +735,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
/**
+ * kernfs_walk_and_get_ns - find and get kernfs_node with the given path
+ * @parent: kernfs_node to search under
+ * @path: path to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with path @path under @parent and get a reference
+ * if found. This function may sleep and returns a pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
+ const char *path, const void *ns)
+{
+ struct kernfs_node *kn;
+
+ mutex_lock(&kernfs_mutex);
+ kn = kernfs_walk_ns(parent, path, ns);
+ kernfs_get(kn);
+ mutex_unlock(&kernfs_mutex);
+
+ return kn;
+}
+
+/**
* kernfs_create_root - create a new kernfs hierarchy
* @scops: optional syscall operations for the hierarchy
* @flags: KERNFS_ROOT_* flags
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 756dd56aaf60..16405ae88d2d 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
if (!attrs)
return -ENOMEM;
- return simple_xattr_remove(&attrs->xattrs, name);
+ return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
}
ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
@@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
if (!attrs)
return -ENOMEM;
- return simple_xattr_list(&attrs->xattrs, buf, size);
+ return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
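
kernfs_walk_and_get_ns() resolves a slash-separated path by repeated kernfs_find_ns() under kernfs_mutex, pinning the final node. A hypothetical usage sketch (the path and root below are illustrative, not from this patch):

	struct kernfs_node *kn;

	kn = kernfs_walk_and_get_ns(root, "foo/bar/baz", NULL);
	if (kn) {
		/* ... use kn ... */
		kernfs_put(kn);	/* drop the reference the walk took */
	}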
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index db272528ab5b..117b8b3416f9 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,18 +112,25 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
return error;
}
-static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
+static const char *kernfs_iop_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- int error = -ENOMEM;
- unsigned long page = get_zeroed_page(GFP_KERNEL);
- if (!page)
+ char *body;
+ int error;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = kernfs_getlink(dentry, (char *)page);
+ error = kernfs_getlink(dentry, body);
if (unlikely(error < 0)) {
- free_page((unsigned long)page);
+ kfree(body);
return ERR_PTR(error);
}
- return *cookie = (char *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
const struct inode_operations kernfs_symlink_iops = {
@@ -132,8 +139,7 @@ const struct inode_operations kernfs_symlink_iops = {
.getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
.readlink = generic_readlink,
- .follow_link = kernfs_iop_follow_link,
- .put_link = free_page_put_link,
+ .get_link = kernfs_iop_get_link,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.permission = kernfs_iop_permission,
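
This is part of the tree-wide switch from ->follow_link/->put_link cookies to ->get_link() plus struct delayed_call: the method returns the link body and registers its destructor, and the caller runs the destructor when done. A simplified sketch of the consuming side (the real consumer is the fs/namei.c path walker):

	DEFINE_DELAYED_CALL(done);
	const char *link;

	link = inode->i_op->get_link(dentry, inode, &done);
	if (IS_ERR(link))
		return PTR_ERR(link);
	/* ... follow "link" ... */
	do_delayed_call(&done);	/* for kernfs above, runs kfree_link(body) */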
diff --git a/fs/libfs.c b/fs/libfs.c
index c7cbfb092e94..01491299f348 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1019,17 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void kfree_put_link(struct inode *unused, void *cookie)
+/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
+void kfree_link(void *p)
{
- kfree(cookie);
+ kfree(p);
}
-EXPORT_SYMBOL(kfree_put_link);
-
-void free_page_put_link(struct inode *unused, void *cookie)
-{
- free_page((unsigned long) cookie);
-}
-EXPORT_SYMBOL(free_page_put_link);
+EXPORT_SYMBOL(kfree_link);
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -1092,14 +1087,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
}
EXPORT_SYMBOL(simple_nosetlease);
-const char *simple_follow_link(struct dentry *dentry, void **cookie)
+const char *simple_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
{
- return d_inode(dentry)->i_link;
+ return inode->i_link;
}
-EXPORT_SYMBOL(simple_follow_link);
+EXPORT_SYMBOL(simple_get_link);
const struct inode_operations simple_symlink_inode_operations = {
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.readlink = generic_readlink
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index acd394716349..112952037933 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -474,18 +474,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho
static int do_vfs_lock(struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_file_wait(fl->fl_file, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_file_wait(fl->fl_file, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_file_wait(fl->fl_file, fl);
}
/*
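
do_vfs_lock() no longer dispatches on FL_POSIX/FL_FLOCK itself; that now lives behind locks_lock_file_wait() (see the fs/locks.c hunks below, which add locks_lock_inode_wait()). Sketch of the wrapper assumed here:

	static inline int locks_lock_file_wait(struct file *filp,
					       struct file_lock *fl)
	{
		return locks_lock_inode_wait(file_inode(filp), fl);
	}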
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 969d589c848d..d716c9993a26 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
atomic_inc(&nsm->sm_count);
else {
host = NULL;
- nsm = nsm_get_handle(ni->sap, ni->salen,
+ nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
ni->hostname, ni->hostname_len);
if (unlikely(nsm == NULL)) {
dprintk("lockd: %s failed; no nsm handle\n",
@@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
host->h_nsmhandle = nsm;
host->h_addrbuf = nsm->sm_addrbuf;
host->net = ni->net;
+ strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
out:
return host;
@@ -534,17 +535,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
/**
* nlm_host_rebooted - Release all resources held by rebooted host
+ * @net: network namespace
* @info: pointer to decoded results of NLM_SM_NOTIFY call
*
* We were notified that the specified host has rebooted. Release
* all resources held by that peer.
*/
-void nlm_host_rebooted(const struct nlm_reboot *info)
+void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info)
{
struct nsm_handle *nsm;
struct nlm_host *host;
- nsm = nsm_reboot_lookup(info);
+ nsm = nsm_reboot_lookup(net, info);
if (unlikely(nsm == NULL))
return;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 47a32b6d9b90..19166d4a8d31 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -42,7 +42,7 @@ struct nsm_args {
u32 proc;
char *mon_name;
- char *nodename;
+ const char *nodename;
};
struct nsm_res {
@@ -51,7 +51,6 @@ struct nsm_res {
};
static const struct rpc_program nsm_program;
-static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
/*
@@ -87,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
return rpc_create(&args);
}
-static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
- struct rpc_clnt *clnt)
-{
- spin_lock(&ln->nsm_clnt_lock);
- if (ln->nsm_users == 0) {
- if (clnt == NULL)
- goto out;
- ln->nsm_clnt = clnt;
- }
- clnt = ln->nsm_clnt;
- ln->nsm_users++;
-out:
- spin_unlock(&ln->nsm_clnt_lock);
- return clnt;
-}
-
-static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
-{
- struct rpc_clnt *clnt, *new;
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- clnt = nsm_client_set(ln, NULL);
- if (clnt != NULL)
- goto out;
-
- clnt = new = nsm_create(net, nodename);
- if (IS_ERR(clnt))
- goto out;
-
- clnt = nsm_client_set(ln, new);
- if (clnt != new)
- rpc_shutdown_client(new);
-out:
- return clnt;
-}
-
-static void nsm_client_put(struct net *net)
-{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
- struct rpc_clnt *clnt = NULL;
-
- spin_lock(&ln->nsm_clnt_lock);
- ln->nsm_users--;
- if (ln->nsm_users == 0) {
- clnt = ln->nsm_clnt;
- ln->nsm_clnt = NULL;
- }
- spin_unlock(&ln->nsm_clnt_lock);
- if (clnt != NULL)
- rpc_shutdown_client(clnt);
-}
-
static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
- struct rpc_clnt *clnt)
+ const struct nlm_host *host)
{
int status;
+ struct rpc_clnt *clnt;
struct nsm_args args = {
.priv = &nsm->sm_priv,
.prog = NLM_PROGRAM,
.vers = 3,
.proc = NLMPROC_NSM_NOTIFY,
.mon_name = nsm->sm_mon_name,
- .nodename = clnt->cl_nodename,
+ .nodename = host->nodename,
};
struct rpc_message msg = {
.rpc_argp = &args,
@@ -158,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
memset(res, 0, sizeof(*res));
+ clnt = nsm_create(host->net, host->nodename);
+ if (IS_ERR(clnt)) {
+ dprintk("lockd: failed to create NSM upcall transport, "
+ "status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+ return PTR_ERR(clnt);
+ }
+
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
if (status == -ECONNREFUSED) {
@@ -171,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
status);
else
status = 0;
+
+ rpc_shutdown_client(clnt);
return status;
}
@@ -190,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host)
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
- struct rpc_clnt *clnt;
- const char *nodename = NULL;
dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
if (nsm->sm_monitored)
return 0;
- if (host->h_rpcclnt)
- nodename = host->h_rpcclnt->cl_nodename;
-
/*
* Choose whether to record the caller_name or IP address of
* this peer in the local rpc.statd's database.
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- clnt = nsm_client_get(host->net, nodename);
- if (IS_ERR(clnt)) {
- status = PTR_ERR(clnt);
- dprintk("lockd: failed to create NSM upcall transport, "
- "status=%d, net=%p\n", status, host->net);
- return status;
- }
-
- status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host);
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
@@ -247,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host)
if (atomic_read(&nsm->sm_count) == 1
&& nsm->sm_monitored && !nsm->sm_sticky) {
- struct lockd_net *ln = net_generic(host->net, lockd_net_id);
-
dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host);
if (res.status != 0)
status = -EIO;
if (status < 0)
@@ -259,38 +201,38 @@ void nsm_unmonitor(const struct nlm_host *host)
nsm->sm_name);
else
nsm->sm_monitored = 0;
-
- nsm_client_put(host->net);
}
}
-static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
- const size_t len)
+static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles,
+ const char *hostname, const size_t len)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (strlen(nsm->sm_name) == len &&
memcmp(nsm->sm_name, hostname, len) == 0)
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles,
+ const struct sockaddr *sap)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (rpc_cmp_addr(nsm_addr(nsm), sap))
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles,
+ const struct nsm_private *priv)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (memcmp(nsm->sm_priv.data, priv->data,
sizeof(priv->data)) == 0)
return nsm;
@@ -353,6 +295,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
/**
* nsm_get_handle - Find or create a cached nsm_handle
+ * @net: network namespace
* @sap: pointer to socket address of handle to find
* @salen: length of socket address
* @hostname: pointer to C string containing hostname to find
@@ -365,11 +308,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
* @hostname cannot be found in the handle cache. Returns NULL if
* an error occurs.
*/
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+ const struct sockaddr *sap,
const size_t salen, const char *hostname,
const size_t hostname_len)
{
struct nsm_handle *cached, *new = NULL;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
@@ -384,9 +329,10 @@ retry:
spin_lock(&nsm_lock);
if (nsm_use_hostnames && hostname != NULL)
- cached = nsm_lookup_hostname(hostname, hostname_len);
+ cached = nsm_lookup_hostname(&ln->nsm_handles,
+ hostname, hostname_len);
else
- cached = nsm_lookup_addr(sap);
+ cached = nsm_lookup_addr(&ln->nsm_handles, sap);
if (cached != NULL) {
atomic_inc(&cached->sm_count);
@@ -400,7 +346,7 @@ retry:
}
if (new != NULL) {
- list_add(&new->sm_link, &nsm_handles);
+ list_add(&new->sm_link, &ln->nsm_handles);
spin_unlock(&nsm_lock);
dprintk("lockd: created nsm_handle for %s (%s)\n",
new->sm_name, new->sm_addrbuf);
@@ -417,19 +363,22 @@ retry:
/**
* nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @net: network namespace
* @info: pointer to NLMPROC_SM_NOTIFY arguments
*
* Returns a matching nsm_handle if found in the nsm cache. The returned
* nsm_handle's reference count is bumped. Otherwise returns NULL if some
* error occurred.
*/
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+ const struct nlm_reboot *info)
{
struct nsm_handle *cached;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
spin_lock(&nsm_lock);
- cached = nsm_lookup_priv(&info->priv);
+ cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv);
if (unlikely(cached == NULL)) {
spin_unlock(&nsm_lock);
dprintk("lockd: never saw rebooted peer '%.*s' before\n",
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 097bfa3adb1c..5426189406c1 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,9 +12,7 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- spinlock_t nsm_clnt_lock;
- unsigned int nsm_users;
- struct rpc_clnt *nsm_clnt;
+ struct list_head nsm_handles;
};
extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d678bcc3cbcb..154a107cd376 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,13 +25,17 @@
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/inetdevice.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <net/ip.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <linux/lockd/lockd.h>
#include <linux/nfs.h>
@@ -44,7 +48,7 @@
static struct svc_program nlmsvc_program;
-struct nlmsvc_binding * nlmsvc_ops;
+const struct nlmsvc_binding *nlmsvc_ops;
EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
@@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void)
static void grace_ender(struct work_struct *grace)
{
- struct delayed_work *dwork = container_of(grace, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(grace);
struct lockd_net *ln = container_of(dwork, struct lockd_net,
grace_period_end);
@@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
}
}
+static int lockd_inetaddr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inetaddr_event: removed %pI4\n",
+ &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inetaddr_notifier = {
+ .notifier_call = lockd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int lockd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inet6addr_notifier = {
+ .notifier_call = lockd_inet6addr_event,
+};
+#endif
+
+static void lockd_svc_exit_thread(void)
+{
+ unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
+ svc_exit_thread(nlmsvc_rqst);
+}
+
static int lockd_start_svc(struct svc_serv *serv)
{
int error;
@@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv)
return 0;
out_task:
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
nlmsvc_task = NULL;
out_rqst:
nlmsvc_rqst = NULL;
@@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void)
printk(KERN_WARNING "lockd_up: create service failed\n");
return ERR_PTR(-ENOMEM);
}
+ register_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
dprintk("lockd_up: service created\n");
return serv;
}
@@ -428,7 +497,7 @@ lockd_down(struct net *net)
}
kthread_stop(nlmsvc_task);
dprintk("lockd_down: service stopped\n");
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
dprintk("lockd_down: service destroyed\n");
nlmsvc_task = NULL;
nlmsvc_rqst = NULL;
@@ -592,7 +661,7 @@ static int lockd_init_net(struct net *net)
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
INIT_LIST_HEAD(&ln->lockd_manager.list);
ln->lockd_manager.block_opens = false;
- spin_lock_init(&ln->nsm_clnt_lock);
+ INIT_LIST_HEAD(&ln->nsm_handles);
return 0;
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b147d1ae71fd..09c576f26c7b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 21171f0c6477..fb26b9f522e7 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/locks.c b/fs/locks.c
index 2a54c800a223..af1ed74a657f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -119,7 +119,6 @@
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -205,37 +204,69 @@ static struct kmem_cache *filelock_cache __read_mostly;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
- struct file_lock_context *new;
+ struct file_lock_context *ctx;
- if (likely(inode->i_flctx) || type == F_UNLCK)
+ /* paired with cmpxchg() below */
+ ctx = smp_load_acquire(&inode->i_flctx);
+ if (likely(ctx) || type == F_UNLCK)
goto out;
- new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
- if (!new)
+ ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+ if (!ctx)
goto out;
- spin_lock_init(&new->flc_lock);
- INIT_LIST_HEAD(&new->flc_flock);
- INIT_LIST_HEAD(&new->flc_posix);
- INIT_LIST_HEAD(&new->flc_lease);
+ spin_lock_init(&ctx->flc_lock);
+ INIT_LIST_HEAD(&ctx->flc_flock);
+ INIT_LIST_HEAD(&ctx->flc_posix);
+ INIT_LIST_HEAD(&ctx->flc_lease);
/*
* Assign the pointer if it's not already assigned. If it is, then
* free the context we just allocated.
*/
- if (cmpxchg(&inode->i_flctx, NULL, new))
- kmem_cache_free(flctx_cache, new);
+ if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+ kmem_cache_free(flctx_cache, ctx);
+ ctx = smp_load_acquire(&inode->i_flctx);
+ }
out:
- return inode->i_flctx;
+ trace_locks_get_lock_context(inode, type, ctx);
+ return ctx;
+}
+
+static void
+locks_dump_ctx_list(struct list_head *list, char *list_type)
+{
+ struct file_lock *fl;
+
+ list_for_each_entry(fl, list, fl_list) {
+ pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+ }
+}
+
+static void
+locks_check_ctx_lists(struct inode *inode)
+{
+ struct file_lock_context *ctx = inode->i_flctx;
+
+ if (unlikely(!list_empty(&ctx->flc_flock) ||
+ !list_empty(&ctx->flc_posix) ||
+ !list_empty(&ctx->flc_lease))) {
+ pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
+ MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+ inode->i_ino);
+ locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
+ locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
+ locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
+ }
}
void
-locks_free_lock_context(struct file_lock_context *ctx)
+locks_free_lock_context(struct inode *inode)
{
- if (ctx) {
- WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
- WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
- WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+ struct file_lock_context *ctx = inode->i_flctx;
+
+ if (unlikely(ctx)) {
+ locks_check_ctx_lists(inode);
kmem_cache_free(flctx_cache, ctx);
}
}
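
i_flctx is now always read with smp_load_acquire() and published with cmpxchg(), so any reader that sees a non-NULL context also sees its initialized spinlock and lists. The generic once-only publication pattern being applied (init_ctx() is a stand-in for the field initialization above):

	ctx = smp_load_acquire(&inode->i_flctx);	/* pairs with cmpxchg() */
	if (!ctx) {
		new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
		init_ctx(new);			/* fully initialize before publishing */
		if (cmpxchg(&inode->i_flctx, NULL, new)) {
			kmem_cache_free(flctx_cache, new);	/* lost the race */
			ctx = smp_load_acquire(&inode->i_flctx);
		} else {
			ctx = new;
		}
	}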
@@ -762,7 +793,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock_context *ctx;
struct inode *inode = file_inode(filp);
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
@@ -930,7 +961,8 @@ out:
return error;
}
-static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ struct file_lock *conflock)
{
struct file_lock *fl, *tmp;
struct file_lock *new_fl = NULL;
@@ -1138,6 +1170,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
+ trace_posix_lock_inode(inode, request, error);
+
return error;
}
@@ -1158,7 +1192,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
int posix_lock_file(struct file *filp, struct file_lock *fl,
struct file_lock *conflock)
{
- return __posix_lock_file(file_inode(filp), fl, conflock);
+ return posix_lock_inode(file_inode(filp), fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);
@@ -1167,15 +1201,14 @@ EXPORT_SYMBOL(posix_lock_file);
* @inode: inode of file to which lock request should be applied
* @fl: The lock to be applied
*
- * Variant of posix_lock_file_wait that does not take a filp, and so can be
- * used after the filp has already been torn down.
+ * Apply a POSIX style lock request to an inode.
*/
-int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep ();
for (;;) {
- error = __posix_lock_file(inode, fl, NULL);
+ error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1187,8 +1220,8 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(posix_lock_inode_wait);
+#ifdef CONFIG_MANDATORY_FILE_LOCKING
/**
* locks_mandatory_locked - Check for an active lock
* @file: the file to check
@@ -1203,7 +1236,7 @@ int locks_mandatory_locked(struct file *file)
struct file_lock_context *ctx;
struct file_lock *fl;
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix))
return 0;
@@ -1225,20 +1258,16 @@ int locks_mandatory_locked(struct file *file)
/**
* locks_mandatory_area - Check for a conflicting lock
- * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ
- * for shared
- * @inode: the file to check
+ * @inode: the file to check
* @filp: how the file was opened (if it was)
- * @offset: start of area to check
- * @count: length of area to check
+ * @start: first byte in the file to check
+ * @end: last byte in the file to check
+ * @type: %F_WRLCK for a write lock, else %F_RDLCK
*
* Searches the inode's list of locks to find any POSIX locks which conflict.
- * This function is called from rw_verify_area() and
- * locks_verify_truncate().
*/
-int locks_mandatory_area(int read_write, struct inode *inode,
- struct file *filp, loff_t offset,
- size_t count)
+int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
+ loff_t end, unsigned char type)
{
struct file_lock fl;
int error;
@@ -1250,15 +1279,15 @@ int locks_mandatory_area(int read_write, struct inode *inode,
fl.fl_flags = FL_POSIX | FL_ACCESS;
if (filp && !(filp->f_flags & O_NONBLOCK))
sleep = true;
- fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
- fl.fl_start = offset;
- fl.fl_end = offset + count - 1;
+ fl.fl_type = type;
+ fl.fl_start = start;
+ fl.fl_end = end;
for (;;) {
if (filp) {
fl.fl_owner = filp;
fl.fl_flags &= ~FL_SLEEP;
- error = __posix_lock_file(inode, &fl, NULL);
+ error = posix_lock_inode(inode, &fl, NULL);
if (!error)
break;
}
@@ -1266,7 +1295,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
if (sleep)
fl.fl_flags |= FL_SLEEP;
fl.fl_owner = current->files;
- error = __posix_lock_file(inode, &fl, NULL);
+ error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
@@ -1287,6 +1316,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
}
EXPORT_SYMBOL(locks_mandatory_area);
+#endif /* CONFIG_MANDATORY_FILE_LOCKING */
static void lease_clear_pending(struct file_lock *fl, int arg)
{
@@ -1388,7 +1418,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
struct file_lock *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
@@ -1400,6 +1430,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
WARN_ON_ONCE(1);
return error;
@@ -1494,17 +1525,16 @@ EXPORT_SYMBOL(__break_lease);
void lease_get_mtime(struct inode *inode, struct timespec *time)
{
bool has_lease = false;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
struct file_lock *fl;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
- if (!list_empty(&ctx->flc_lease)) {
- fl = list_first_entry(&ctx->flc_lease,
- struct file_lock, fl_list);
- if (fl->fl_type == F_WRLCK)
- has_lease = true;
- }
+ fl = list_first_entry_or_null(&ctx->flc_lease,
+ struct file_lock, fl_list);
+ if (fl && (fl->fl_type == F_WRLCK))
+ has_lease = true;
spin_unlock(&ctx->flc_lock);
}
@@ -1543,10 +1573,11 @@ int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
struct inode *inode = file_inode(filp);
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
int type = F_UNLCK;
LIST_HEAD(dispose);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
time_out_leases(file_inode(filp), &dispose);
@@ -1711,11 +1742,11 @@ static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
struct file_lock *fl, *victim = NULL;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx;
LIST_HEAD(dispose);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
@@ -1751,8 +1782,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
void **priv)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int error;
if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
@@ -1856,7 +1886,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
*
* Apply a FLOCK style lock request to an inode.
*/
-int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep();
@@ -1873,7 +1903,30 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(flock_lock_inode_wait);
+
+/**
+ * locks_lock_inode_wait - Apply a lock to an inode
+ * @inode: inode of the file to apply to
+ * @fl: The lock to be applied
+ *
+ * Apply a POSIX or FLOCK style lock request to an inode.
+ */
+int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+{
+ int res = 0;
+ switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+ case FL_POSIX:
+ res = posix_lock_inode_wait(inode, fl);
+ break;
+ case FL_FLOCK:
+ res = flock_lock_inode_wait(inode, fl);
+ break;
+ default:
+ BUG();
+ }
+ return res;
+}
+EXPORT_SYMBOL(locks_lock_inode_wait);
/**
* sys_flock: - flock() system call.
@@ -1931,7 +1984,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
(can_sleep) ? F_SETLKW : F_SETLK,
lock);
else
- error = flock_lock_file_wait(f.file, lock);
+ error = locks_lock_file_wait(f.file, lock);
out_free:
locks_free_lock(lock);
@@ -2107,7 +2160,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
-/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
@@ -2138,6 +2191,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
+ inode = file_inode(filp);
+
/*
* This might block, so we do it before checking the inode.
*/
@@ -2145,8 +2200,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (copy_from_user(&flock, l, sizeof(flock)))
goto out;
- inode = file_inode(filp);
-
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
@@ -2155,7 +2208,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2194,23 +2246,29 @@ again:
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by
- * releasing the lock that was just acquired.
+ * Attempt to detect a close/fcntl race and recover by releasing the
+ * lock that was just acquired. There is no need to do that when we're
+ * unlocking though, or for OFD locks.
*/
- /*
- * we need that spin_lock here - it prevents reordering between
- * update of i_flctx->flc_posix and check for it done in close().
- * rcu_read_lock() wouldn't do.
- */
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
+ trace_fcntl_setlk(inode, file_lock, error);
locks_free_lock(file_lock);
return error;
}
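
Instead of looping back with l_type forced to F_UNLCK, a detected close/fcntl race now releases the just-acquired lock explicitly and fails with -EBADF; the check is skipped for unlocks and for OFD locks, whose ownership is not tied to the descriptor. The window being closed, sketched as a timeline:

	/*
	 *	fcntl(fd, F_SETLK)			close(fd)
	 *	  fdget(fd) -> filp
	 *						  fd table entry cleared
	 *						  locks_remove_posix() finds
	 *						  no lock to remove yet
	 *	  do_lock_file_wait() acquires lock
	 *	  fcheck(fd) != filp -> unlock, -EBADF	(else the lock would leak)
	 */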
@@ -2295,7 +2353,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock64_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2334,17 +2391,27 @@ again:
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by
- * releasing the lock that was just acquired.
+ * Attempt to detect a close/fcntl race and recover by releasing the
+ * lock that was just acquired. There is no need to do that when we're
+ * unlocking though, or for OFD locks.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
locks_free_lock(file_lock);
return error;
@@ -2358,14 +2425,16 @@ out:
*/
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
+ int error;
struct file_lock lock;
- struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ struct file_lock_context *ctx;
/*
* If there are no locks held on this file, we don't need to call
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
+ ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2379,17 +2448,18 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
lock.fl_ops = NULL;
lock.fl_lmops = NULL;
- vfs_lock_file(filp, F_SETLK, &lock, NULL);
+ error = vfs_lock_file(filp, F_SETLK, &lock, NULL);
if (lock.fl_ops && lock.fl_ops->fl_release_private)
lock.fl_ops->fl_release_private(&lock);
+ trace_locks_remove_posix(file_inode(filp), &lock, error);
}
EXPORT_SYMBOL(locks_remove_posix);
/* The i_flctx must be valid when calling into here */
static void
-locks_remove_flock(struct file *filp)
+locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
struct file_lock fl = {
.fl_owner = filp,
@@ -2400,7 +2470,6 @@ locks_remove_flock(struct file *filp)
.fl_end = OFFSET_MAX,
};
struct inode *inode = file_inode(filp);
- struct file_lock_context *flctx = inode->i_flctx;
if (list_empty(&flctx->flc_flock))
return;
@@ -2416,10 +2485,8 @@ locks_remove_flock(struct file *filp)
/* The i_flctx must be valid when calling into here */
static void
-locks_remove_lease(struct file *filp)
+locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
- struct inode *inode = file_inode(filp);
- struct file_lock_context *ctx = inode->i_flctx;
struct file_lock *fl, *tmp;
LIST_HEAD(dispose);
@@ -2439,17 +2506,20 @@ locks_remove_lease(struct file *filp)
*/
void locks_remove_file(struct file *filp)
{
- if (!file_inode(filp)->i_flctx)
+ struct file_lock_context *ctx;
+
+ ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+ if (!ctx)
return;
/* remove any OFD locks */
locks_remove_posix(filp, filp);
/* remove flock locks */
- locks_remove_flock(filp);
+ locks_remove_flock(filp, ctx);
/* remove any leases */
- locks_remove_lease(filp);
+ locks_remove_lease(filp, ctx);
}
/**
@@ -2616,7 +2686,7 @@ void show_fd_locks(struct seq_file *f,
struct file_lock_context *ctx;
int id = 0;
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx)
return;
@@ -2678,7 +2748,7 @@ static int __init proc_locks_init(void)
proc_create("locks", 0, NULL, &proc_locks_operations);
return 0;
}
-module_init(proc_locks_init);
+fs_initcall(proc_locks_init);
#endif
static int __init filelock_init(void)
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066c0221..2b4503163930 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
config LOGFS
tristate "LogFS file system"
- depends on (MTD || BLOCK)
+ depends on MTD || (!MTD && BLOCK)
select ZLIB_INFLATE
select ZLIB_DEFLATE
select CRC32
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a7fdbd868474..a709d80c8ebc 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, BIO_MAX_PAGES);
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, BIO_MAX_PAGES);
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
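The min_t() change above resolves a type mismatch: nr_pages is a size_t while BIO_MAX_PAGES is a plain int, and the kernel's min() refuses mixed types at compile time. A minimal sketch of the casting helper (simplified macro without the kernel's single-evaluation safeguards; the BIO_MAX_PAGES value is illustrative):

    #include <stdio.h>

    /* min_t(type, a, b) casts both sides to one explicit type first. */
    #define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

    #define BIO_MAX_PAGES 256    /* illustrative value, not the kernel's */

    int main(void)
    {
        size_t nr_pages = 1000;
        unsigned int max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);

        printf("capping %zu requested pages to %u\n", nr_pages, max_pages);
        return 0;
    }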
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f9b45d46d4c4..542468e9bfb4 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -528,7 +528,8 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
return __logfs_create(dir, dentry, inode, target, destlen);
@@ -776,12 +777,6 @@ fail:
return -EIO;
}
-const struct inode_operations logfs_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
-
const struct inode_operations logfs_dir_iops = {
.create = logfs_create,
.link = logfs_link,
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index af49e2d6941a..db9cfc598883 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -64,7 +64,8 @@ static void logfs_inode_setops(struct inode *inode)
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFLNK:
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFSOCK: /* fall through */
@@ -408,7 +409,8 @@ const struct super_operations logfs_super_operations = {
int logfs_init_inode_cache(void)
{
logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
- sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+ sizeof(struct logfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
logfs_init_once);
if (!logfs_inode_cache)
return -ENOMEM;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 5f0937609465..27d040e35faa 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -302,7 +302,7 @@ struct logfs_block {
struct inode *inode;
struct logfs_transaction *ta;
unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
- struct logfs_block_ops *ops;
+ const struct logfs_block_ops *ops;
int full;
int partial;
int reserved_bytes;
@@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
#endif
/* dev_mtd.c */
-#ifdef CONFIG_MTD
+#if IS_ENABLED(CONFIG_MTD)
int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
#else
static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
@@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
#endif
/* dir.c */
-extern const struct inode_operations logfs_symlink_iops;
extern const struct inode_operations logfs_dir_iops;
extern const struct file_operations logfs_dir_fops;
int logfs_replay_journal(struct super_block *sb);
@@ -579,7 +578,7 @@ int logfs_exist_block(struct inode *inode, u64 bix);
int get_page_reserve(struct inode *inode, struct page *page);
void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
-extern struct logfs_block_ops indirect_block_ops;
+extern const struct logfs_block_ops indirect_block_ops;
/* segment.c */
int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 380d86e1ab45..20973c9e52f8 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -569,13 +569,13 @@ static void indirect_free_block(struct super_block *sb,
}
-static struct logfs_block_ops inode_block_ops = {
+static const struct logfs_block_ops inode_block_ops = {
.write_block = inode_write_block,
.free_block = inode_free_block,
.write_alias = inode_write_alias,
};
-struct logfs_block_ops indirect_block_ops = {
+const struct logfs_block_ops indirect_block_ops = {
.write_block = indirect_write_block,
.free_block = indirect_free_block,
.write_alias = indirect_write_alias,
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 7f9b096d8d57..d270e4b2ab6b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
filler_t *filler = super->s_devops->readpage;
struct page *page;
- BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+ BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
if (use_filler)
page = read_cache_page(mapping, index, filler, sb);
else {
@@ -197,7 +197,7 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
return 0;
}
-static struct logfs_block_ops btree_block_ops = {
+static const struct logfs_block_ops btree_block_ops = {
.write_block = btree_write_block,
.free_block = __free_block,
.write_alias = btree_write_alias,
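The logfs hunks above constify the logfs_block_ops method tables. A const-qualified ops table can be placed in read-only data, and an accidental store through the pointer becomes a compile-time error rather than silent corruption of a function pointer. A minimal sketch of the pattern, borrowing names from the diff:

    #include <stdio.h>

    struct logfs_block_ops {
        void (*write_block)(int blockno);
    };

    static void indirect_write_block(int blockno)
    {
        printf("writing indirect block %d\n", blockno);
    }

    static const struct logfs_block_ops indirect_block_ops = {
        .write_block = indirect_write_block,
    };

    int main(void)
    {
        const struct logfs_block_ops *ops = &indirect_block_ops;

        ops->write_block(42);
        /* ops->write_block = NULL; -- now rejected at compile time */
        return 0;
    }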
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 086cd0a61e80..f975d667c539 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
@@ -435,8 +435,7 @@ static const struct address_space_operations minix_aops = {
static const struct inode_operations minix_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = minix_getattr,
};
@@ -452,6 +451,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_mapping->a_ops = &minix_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &minix_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 282e15ad8cd8..46ca39d6c735 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -24,16 +24,15 @@ static inline block_t *i_data(struct inode *inode)
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
- char b[BDEVNAME_SIZE];
if (block < 0) {
- printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
- block, bdevname(inode->i_sb->s_bdev, b));
+ printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+ block, inode->i_sb->s_bdev);
} else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
if (printk_ratelimit())
printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %s\n",
- block, bdevname(inode->i_sb->s_bdev, b));
+ "block %ld too big on dev %pg\n",
+ block, inode->i_sb->s_bdev);
} else if (block < 7) {
offsets[n++] = block;
} else if ((block -= 7) < 512) {
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index 78e2d93e5c83..1ee101352586 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -26,18 +26,17 @@ static inline block_t *i_data(struct inode *inode)
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
- char b[BDEVNAME_SIZE];
struct super_block *sb = inode->i_sb;
if (block < 0) {
- printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
- block, bdevname(sb->s_bdev, b));
+ printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+ block, sb->s_bdev);
} else if ((u64)block * (u64)sb->s_blocksize >=
minix_sb(sb)->s_max_size) {
if (printk_ratelimit())
printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %s\n",
- block, bdevname(sb->s_bdev, b));
+ "block %ld too big on dev %pg\n",
+ block, sb->s_bdev);
} else if (block < DIRCOUNT) {
offsets[n++] = block;
} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
diff --git a/fs/mpage.c b/fs/mpage.c
index a7c34274f207..1480d3a18037 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -361,7 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -397,7 +397,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -485,6 +485,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -593,7 +594,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -620,7 +621,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -630,7 +631,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -642,7 +643,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -698,8 +699,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -716,8 +720,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
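The mpage changes above tag bios from integrity writeback (WB_SYNC_ALL) with WRITE_SYNC, so the block layer treats fsync-driven writes as latency-sensitive, while background writeback keeps plain WRITE. A toy mirror of that decision (the identifiers echo the patch, but the definitions here are invented for illustration):

    #include <stdio.h>

    enum writeback_sync_modes { WB_SYNC_NONE, WB_SYNC_ALL };

    /* Same selection the patch adds in __mpage_writepage() and friends. */
    static const char *pick_write_op(enum writeback_sync_modes sync_mode)
    {
        return sync_mode == WB_SYNC_ALL ? "WRITE_SYNC" : "WRITE";
    }

    int main(void)
    {
        printf("fsync/O_SYNC writeback submits with %s\n",
               pick_write_op(WB_SYNC_ALL));
        printf("background writeback submits with %s\n",
               pick_write_op(WB_SYNC_NONE));
        return 0;
    }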
diff --git a/fs/namei.c b/fs/namei.c
index 33e9495a3129..bceefd5588a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,13 +505,13 @@ struct nameidata {
int total_link_count;
struct saved {
struct path link;
- void *cookie;
+ struct delayed_call done;
const char *name;
- struct inode *inode;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
+ struct inode *link_inode;
unsigned root_seq;
int dfd;
};
@@ -534,10 +534,8 @@ static void restore_nameidata(void)
current->nameidata = old;
if (old)
old->total_link_count = now->total_link_count;
- if (now->stack != now->internal) {
+ if (now->stack != now->internal)
kfree(now->stack);
- now->stack = now->internal;
- }
}
static int __nd_alloc_stack(struct nameidata *nd)
@@ -592,11 +590,8 @@ static void drop_links(struct nameidata *nd)
int i = nd->depth;
while (i--) {
struct saved *last = nd->stack + i;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link) {
- inode->i_op->put_link(inode, last->cookie);
- last->cookie = NULL;
- }
+ do_delayed_call(&last->done);
+ clear_delayed_call(&last->done);
}
}
@@ -657,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd)
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
- * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * normal reference counts on dentries and vfsmounts to transition to ref-walk
* mode. Refcounts are grabbed at the last known good point before rcu-walk
* got stuck, so ref-walk may continue from there. If this is not successful
* (eg. a seqcount has changed), then failure is returned and it's up to caller
@@ -807,19 +802,19 @@ static int complete_walk(struct nameidata *nd)
static void set_root(struct nameidata *nd)
{
- get_fs_root(current->fs, &nd->root);
-}
-
-static void set_root_rcu(struct nameidata *nd)
-{
struct fs_struct *fs = current->fs;
- unsigned seq;
- do {
- seq = read_seqcount_begin(&fs->seq);
- nd->root = fs->root;
- nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ if (nd->flags & LOOKUP_RCU) {
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ nd->root = fs->root;
+ nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ } while (read_seqcount_retry(&fs->seq, seq));
+ } else {
+ get_fs_root(fs, &nd->root);
+ }
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -841,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path,
nd->path.dentry = path->dentry;
}
+static int nd_jump_root(struct nameidata *nd)
+{
+ if (nd->flags & LOOKUP_RCU) {
+ struct dentry *d;
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ nd->inode = d->d_inode;
+ nd->seq = nd->root_seq;
+ if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+ return -ECHILD;
+ } else {
+ path_put(&nd->path);
+ nd->path = nd->root;
+ path_get(&nd->path);
+ nd->inode = nd->path.dentry->d_inode;
+ }
+ nd->flags |= LOOKUP_JUMPED;
+ return 0;
+}
+
/*
- * Helper to directly jump to a known parsed path from ->follow_link,
+ * Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
void nd_jump_link(struct path *path)
@@ -858,9 +873,7 @@ void nd_jump_link(struct path *path)
static inline void put_link(struct nameidata *nd)
{
struct saved *last = nd->stack + --nd->depth;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link)
- inode->i_op->put_link(inode, last->cookie);
+ do_delayed_call(&last->done);
if (!(nd->flags & LOOKUP_RCU))
path_put(&last->link);
}
@@ -892,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd)
return 0;
/* Allowed if owner and follower match. */
- inode = nd->stack[0].inode;
+ inode = nd->link_inode;
if (uid_eq(current_cred()->fsuid, inode->i_uid))
return 0;
@@ -955,26 +968,23 @@ static bool safe_hardlink_source(struct inode *inode)
* - sysctl_protected_hardlinks enabled
* - fsuid does not match inode
* - hardlink source is unsafe (see safe_hardlink_source() above)
- * - not CAP_FOWNER
+ * - not CAP_FOWNER in a namespace with the inode owner uid mapped
*
* Returns 0 if successful, -ve on error.
*/
static int may_linkat(struct path *link)
{
- const struct cred *cred;
struct inode *inode;
if (!sysctl_protected_hardlinks)
return 0;
- cred = current_cred();
inode = link->dentry->d_inode;
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
- if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
- capable(CAP_FOWNER))
+ if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
return 0;
audit_log_link_denied("linkat", link);
@@ -986,7 +996,7 @@ const char *get_link(struct nameidata *nd)
{
struct saved *last = nd->stack + nd->depth - 1;
struct dentry *dentry = last->link.dentry;
- struct inode *inode = last->inode;
+ struct inode *inode = nd->link_inode;
int error;
const char *res;
@@ -1007,36 +1017,27 @@ const char *get_link(struct nameidata *nd)
nd->last_type = LAST_BIND;
res = inode->i_link;
if (!res) {
+ const char * (*get)(struct dentry *, struct inode *,
+ struct delayed_call *);
+ get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
- if (unlikely(unlazy_walk(nd, NULL, 0)))
- return ERR_PTR(-ECHILD);
+ res = get(NULL, inode, &last->done);
+ if (res == ERR_PTR(-ECHILD)) {
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
+ return ERR_PTR(-ECHILD);
+ res = get(dentry, inode, &last->done);
+ }
+ } else {
+ res = get(dentry, inode, &last->done);
}
- res = inode->i_op->follow_link(dentry, &last->cookie);
- if (IS_ERR_OR_NULL(res)) {
- last->cookie = NULL;
+ if (IS_ERR_OR_NULL(res))
return res;
- }
}
if (*res == '/') {
- if (nd->flags & LOOKUP_RCU) {
- struct dentry *d;
- if (!nd->root.mnt)
- set_root_rcu(nd);
- nd->path = nd->root;
- d = nd->path.dentry;
- nd->inode = d->d_inode;
- nd->seq = nd->root_seq;
- if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
- return ERR_PTR(-ECHILD);
- } else {
- if (!nd->root.mnt)
- set_root(nd);
- path_put(&nd->path);
- nd->path = nd->root;
- path_get(&nd->root);
- nd->inode = nd->path.dentry->d_inode;
- }
- nd->flags |= LOOKUP_JUMPED;
+ if (!nd->root.mnt)
+ set_root(nd);
+ if (unlikely(nd_jump_root(nd)))
+ return ERR_PTR(-ECHILD);
while (unlikely(*++res == '/'))
;
}
@@ -1297,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
static int follow_dotdot_rcu(struct nameidata *nd)
{
struct inode *inode = nd->inode;
- if (!nd->root.mnt)
- set_root_rcu(nd);
while (1) {
if (path_equal(&nd->path, &nd->root))
@@ -1418,9 +1417,6 @@ static void follow_mount(struct path *path)
static int follow_dotdot(struct nameidata *nd)
{
- if (!nd->root.mnt)
- set_root(nd);
-
while(1) {
struct dentry *old = nd->path.dentry;
@@ -1658,6 +1654,8 @@ static inline int may_lookup(struct nameidata *nd)
static inline int handle_dots(struct nameidata *nd, int type)
{
if (type == LAST_DOTDOT) {
+ if (!nd->root.mnt)
+ set_root(nd);
if (nd->flags & LOOKUP_RCU) {
return follow_dotdot_rcu(nd);
} else
@@ -1694,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link,
last = nd->stack + nd->depth++;
last->link = *link;
- last->cookie = NULL;
- last->inode = inode;
+ clear_delayed_call(&last->done);
+ nd->link_inode = inode;
last->seq = seq;
return 1;
}
@@ -1969,7 +1967,7 @@ OK:
if (err) {
const char *s = get_link(nd);
- if (unlikely(IS_ERR(s)))
+ if (IS_ERR(s))
return PTR_ERR(s);
err = 0;
if (unlikely(!s)) {
@@ -1999,7 +1997,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
- nd->total_link_count = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
@@ -2024,18 +2021,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
nd->root.mnt = NULL;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
nd->m_seq = read_seqbegin(&mount_lock);
if (*s == '/') {
- if (flags & LOOKUP_RCU) {
+ if (flags & LOOKUP_RCU)
rcu_read_lock();
- set_root_rcu(nd);
- nd->seq = nd->root_seq;
- } else {
- set_root(nd);
- path_get(&nd->root);
- }
- nd->path = nd->root;
+ set_root(nd);
+ if (likely(!nd_jump_root(nd)))
+ return s;
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ return ERR_PTR(-ECHILD);
} else if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
@@ -2046,11 +2044,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
+ nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
+ nd->inode = nd->path.dentry->d_inode;
}
+ return s;
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
@@ -2080,16 +2081,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
fdput(f);
return s;
}
-
- nd->inode = nd->path.dentry->d_inode;
- if (!(flags & LOOKUP_RCU))
- return s;
- if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
- return s;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- return ERR_PTR(-ECHILD);
}
static const char *trailing_symlink(struct nameidata *nd)
@@ -2282,6 +2273,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2325,6 +2318,75 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @name should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ /*
+ * __d_lookup() is used to try to get a quick answer and avoid the
+ * mutex. A false-negative does no harm.
+ */
+ ret = __d_lookup(base, &this);
+ if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
+ dput(ret);
+ ret = NULL;
+ }
+ if (ret)
+ return ret;
+
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
@@ -2673,10 +2735,6 @@ static int may_open(struct path *path, int acc_mode, int flag)
struct inode *inode = dentry->d_inode;
int error;
- /* O_PATH? */
- if (!acc_mode)
- return 0;
-
if (!inode)
return -ENOENT;
@@ -2698,7 +2756,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
}
- error = inode_permission(inode, acc_mode);
+ error = inode_permission(inode, MAY_OPEN | acc_mode);
if (error)
return error;
@@ -2890,7 +2948,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
if (*opened & FILE_CREATED) {
WARN_ON(!(open_flag & O_CREAT));
fsnotify_create(dir, dentry);
- acc_mode = MAY_OPEN;
+ acc_mode = 0;
}
error = may_open(&file->f_path, acc_mode, open_flag);
if (error)
@@ -3103,7 +3161,7 @@ retry_lookup:
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
will_truncate = false;
- acc_mode = MAY_OPEN;
+ acc_mode = 0;
path_to_nameidata(&path, nd);
goto finish_open_created;
}
@@ -3187,10 +3245,11 @@ finish_open:
got_write = true;
}
finish_open_created:
- error = may_open(&nd->path, acc_mode, open_flag);
- if (error)
- goto out;
-
+ if (likely(!(open_flag & O_PATH))) {
+ error = may_open(&nd->path, acc_mode, open_flag);
+ if (error)
+ goto out;
+ }
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
if (!error) {
@@ -3277,7 +3336,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
goto out2;
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
- error = may_open(&path, MAY_OPEN, op->open_flag);
+ error = may_open(&path, 0, op->open_flag);
if (error)
goto out2;
file->f_path.mnt = path.mnt;
@@ -3383,7 +3442,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return ERR_PTR(-ELOOP);
filename = getname_kernel(name);
- if (unlikely(IS_ERR(filename)))
+ if (IS_ERR(filename))
return ERR_CAST(filename);
set_nameidata(&nd, -1, filename);
@@ -4499,72 +4558,73 @@ EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
- * have ->follow_link() touching nd only in nd_set_link(). Using (or not
- * using) it for any given inode is up to filesystem.
+ * have ->get_link() not calling nd_jump_link(). Using (or not using) it
+ * for any given inode is up to filesystem.
*/
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- void *cookie;
+ DEFINE_DELAYED_CALL(done);
struct inode *inode = d_inode(dentry);
const char *link = inode->i_link;
int res;
if (!link) {
- link = inode->i_op->follow_link(dentry, &cookie);
+ link = inode->i_op->get_link(dentry, inode, &done);
if (IS_ERR(link))
return PTR_ERR(link);
}
res = readlink_copy(buffer, buflen, link);
- if (inode->i_op->put_link)
- inode->i_op->put_link(inode, cookie);
+ do_delayed_call(&done);
return res;
}
EXPORT_SYMBOL(generic_readlink);
/* get the link contents into pagecache */
-static char *page_getlink(struct dentry * dentry, struct page **ppage)
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
char *kaddr;
struct page *page;
- struct address_space *mapping = dentry->d_inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- return (char*)page;
- *ppage = page;
- kaddr = kmap(page);
- nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+ struct address_space *mapping = inode->i_mapping;
+
+ if (!dentry) {
+ page = find_get_page(mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ page = read_mapping_page(mapping, 0, NULL);
+ if (IS_ERR(page))
+ return (char*)page;
+ }
+ set_delayed_call(callback, page_put_link, page);
+ BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+ kaddr = page_address(page);
+ nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
-int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
- struct page *page = NULL;
- int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
- return res;
-}
-EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_get_link);
-const char *page_follow_link_light(struct dentry *dentry, void **cookie)
+void page_put_link(void *arg)
{
- struct page *page = NULL;
- char *res = page_getlink(dentry, &page);
- if (!IS_ERR(res))
- *cookie = page;
- return res;
+ put_page(arg);
}
-EXPORT_SYMBOL(page_follow_link_light);
+EXPORT_SYMBOL(page_put_link);
-void page_put_link(struct inode *unused, void *cookie)
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- struct page *page = cookie;
- kunmap(page);
- page_cache_release(page);
+ DEFINE_DELAYED_CALL(done);
+ int res = readlink_copy(buffer, buflen,
+ page_get_link(dentry, d_inode(dentry),
+ &done));
+ do_delayed_call(&done);
+ return res;
}
-EXPORT_SYMBOL(page_put_link);
+EXPORT_SYMBOL(page_readlink);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4575,7 +4635,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
struct page *page;
void *fsdata;
int err;
- char *kaddr;
unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
if (nofs)
flags |= AOP_FLAG_NOFS;
@@ -4586,9 +4645,7 @@ retry:
if (err)
goto fail;
- kaddr = kmap_atomic(page);
- memcpy(kaddr, symname, len-1);
- kunmap_atomic(kaddr);
+ memcpy(page_address(page), symname, len-1);
err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
@@ -4607,13 +4664,12 @@ EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+ !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
}
EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
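The namei.c rework above retires the ->follow_link()/->put_link() cookie pair in favour of ->get_link() plus a struct delayed_call: the producer of the link body registers a destructor alongside the result, and the consumer invokes it once the body is no longer needed, the way page_get_link() registers page_put_link(). A self-contained userspace analogue of that deferred-cleanup shape (simplified types, not kernel code):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct delayed_call {
        void (*fn)(void *);
        void *arg;
    };

    #define DEFINE_DELAYED_CALL(name) struct delayed_call name = { NULL, NULL }

    static void set_delayed_call(struct delayed_call *call,
                                 void (*fn)(void *), void *arg)
    {
        call->fn = fn;
        call->arg = arg;
    }

    static void do_delayed_call(struct delayed_call *call)
    {
        if (call->fn)
            call->fn(call->arg);
    }

    static void free_link(void *arg)
    {
        free(arg);
    }

    /* Toy ->get_link(): return the link body and register its cleanup. */
    static const char *get_link(struct delayed_call *done)
    {
        char *body = strdup("/some/target");

        if (body)
            set_delayed_call(done, free_link, body);
        return body;
    }

    int main(void)
    {
        DEFINE_DELAYED_CALL(done);
        const char *link = get_link(&done);

        if (link)
            printf("following %s\n", link);
        do_delayed_call(&done);    /* cleanup deferred until here */
        return 0;
    }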
diff --git a/fs/namespace.c b/fs/namespace.c
index 0570729c87fd..a830e1463704 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1584,6 +1584,14 @@ static inline bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
+static inline bool may_mandlock(void)
+{
+#ifndef CONFIG_MANDATORY_FILE_LOCKING
+ return false;
+#endif
+ return capable(CAP_SYS_ADMIN);
+}
+
/*
* Now umount can handle mount points as well as block devices.
* This is important for filesystems which use unnamed block devices.
@@ -2601,18 +2609,18 @@ static long exact_copy_from_user(void *to, const void __user * from,
return n;
}
-int copy_mount_options(const void __user * data, unsigned long *where)
+void *copy_mount_options(const void __user * data)
{
int i;
- unsigned long page;
unsigned long size;
+ char *copy;
- *where = 0;
if (!data)
- return 0;
+ return NULL;
- if (!(page = __get_free_page(GFP_KERNEL)))
- return -ENOMEM;
+ copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!copy)
+ return ERR_PTR(-ENOMEM);
/* We only care that *some* data at the address the user
* gave us is valid. Just in case, we'll zero
@@ -2623,15 +2631,14 @@ int copy_mount_options(const void __user * data, unsigned long *where)
if (size > PAGE_SIZE)
size = PAGE_SIZE;
- i = size - exact_copy_from_user((void *)page, data, size);
+ i = size - exact_copy_from_user(copy, data, size);
if (!i) {
- free_page(page);
- return -EFAULT;
+ kfree(copy);
+ return ERR_PTR(-EFAULT);
}
if (i != PAGE_SIZE)
- memset((char *)page + i, 0, PAGE_SIZE - i);
- *where = page;
- return 0;
+ memset(copy + i, 0, PAGE_SIZE - i);
+ return copy;
}
char *copy_mount_string(const void __user *data)
@@ -2677,6 +2684,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
type_page, flags, data_page);
if (!retval && !may_mount())
retval = -EPERM;
+ if (!retval && (flags & MS_MANDLOCK) && !may_mandlock())
+ retval = -EPERM;
if (retval)
goto dput_out;
@@ -2896,7 +2905,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
int ret;
char *kernel_type;
char *kernel_dev;
- unsigned long data_page;
+ void *options;
kernel_type = copy_mount_string(type);
ret = PTR_ERR(kernel_type);
@@ -2908,14 +2917,14 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
if (IS_ERR(kernel_dev))
goto out_dev;
- ret = copy_mount_options(data, &data_page);
- if (ret < 0)
+ options = copy_mount_options(data);
+ ret = PTR_ERR(options);
+ if (IS_ERR(options))
goto out_data;
- ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
- (void *) data_page);
+ ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
- free_page(data_page);
+ kfree(options);
out_data:
kfree(kernel_dev);
out_dev:
@@ -2939,9 +2948,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
-int path_is_under(struct path *path1, struct path *path2)
+bool path_is_under(struct path *path1, struct path *path2)
{
- int res;
+ bool res;
read_seqlock_excl(&mount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
read_sequnlock_excl(&mount_lock);
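copy_mount_options() above moves from an int-plus-out-parameter contract to returning the buffer directly, with NULL meaning "no options supplied" and an ERR_PTR-encoded errno on failure. A userspace sketch of that pointer-encoded-error idiom (simplified versions of the include/linux/err.h helpers):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* A small negative errno fits in the top MAX_ERRNO pointer values. */
    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *copy_options(const char *data)
    {
        char *copy;

        if (!data)
            return NULL;             /* absent options: not an error */
        copy = strdup(data);
        if (!copy)
            return ERR_PTR(-ENOMEM);
        return copy;
    }

    int main(void)
    {
        void *opts = copy_options("rw,noatime");

        if (IS_ERR(opts)) {
            fprintf(stderr, "mount options: error %ld\n", PTR_ERR(opts));
            return 1;
        }
        printf("mount options: %s\n", opts ? (char *)opts : "(none)");
        free(opts);
        return 0;
    }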
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 93575e91a7aa..f0e3e9e747dd 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -597,7 +597,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
qname.name = __name;
newdent = d_hash_and_lookup(dentry, &qname);
- if (unlikely(IS_ERR(newdent)))
+ if (IS_ERR(newdent))
goto end_advance;
if (!newdent) {
newdent = d_alloc(dentry, &qname);
@@ -1165,8 +1165,6 @@ out:
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- if (!new_valid_dev(rdev))
- return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
ncp_dbg(1, "mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9605a2f63549..1af15fcbe57b 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
sizeof(struct ncp_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ncp_inode_cachep == NULL)
return -ENOMEM;
@@ -244,8 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
static const struct inode_operations ncp_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ncp_notify_change,
};
#endif
@@ -283,6 +282,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &ncp_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &ncp_symlink_aops;
#endif
} else {
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 79b113048eac..0a3f9b594602 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -525,6 +525,8 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
switch (rqdata.cmd) {
case NCP_LOCK_EX:
case NCP_LOCK_SH:
+ if (rqdata.timeout < 0)
+ return -EINVAL;
if (rqdata.timeout == 0)
rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT;
else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9cd4eb3a1e22..ddd0138f410c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -229,7 +229,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
struct parallel_io *par;
loff_t f_offset = header->args.offset;
size_t bytes_left = header->args.count;
- unsigned int pg_offset, pg_len;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
struct page **pages = header->args.pages;
int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
@@ -262,7 +262,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
@@ -273,9 +272,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
pg_len = PAGE_CACHE_SIZE;
}
- isect += (pg_offset >> SECTOR_SHIFT);
- extent_length -= (pg_offset >> SECTOR_SHIFT);
-
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
@@ -301,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
+ pg_offset = 0;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
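The blocklayout read fix above initializes pg_offset from args.pgbase once and zeroes it after the first page, instead of recomputing it from the file offset on every loop iteration. The per-page arithmetic it restores, as a standalone sketch (sizes illustrative):

    #include <stdio.h>

    #define PAGE_SIZE 4096u    /* stands in for PAGE_CACHE_SIZE */

    int main(void)
    {
        unsigned int pg_offset = 1024;    /* args.pgbase: first page only */
        size_t bytes_left = 10000;

        while (bytes_left > 0) {
            unsigned int pg_len = PAGE_SIZE - pg_offset;

            if (pg_len > bytes_left)
                pg_len = bytes_left;
            printf("chunk: offset %u, len %u\n", pg_offset, pg_len);
            bytes_left -= pg_len;
            pg_offset = 0;    /* the fix: later pages start at offset 0 */
        }
        return 0;
    }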
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f7c0a7538a..a7f2e6e33305 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,17 +99,6 @@ nfs4_callback_up(struct svc_serv *serv)
}
#if defined(CONFIG_NFS_V4_1)
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- /*
- * Create an svc_sock for the back channel service that shares the
- * fore channel connection.
- * Returns the input port (0) and sets the svc_serv bc_xprt on success
- */
- return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
- SVC_SOCK_ANONYMOUS);
-}
-
/*
* The callback service for NFSv4.1 callbacks
*/
@@ -184,11 +173,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
xprt->bc_serv = serv;
}
#else
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- return 0;
-}
-
static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
{
@@ -259,7 +243,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
svc_shutdown_net(serv, net);
}
-static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ struct net *net, struct rpc_xprt *xprt)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
int ret;
@@ -275,20 +260,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
goto err_bind;
}
- switch (minorversion) {
- case 0:
- ret = nfs4_callback_up_net(serv, net);
- break;
- case 1:
- case 2:
- ret = nfs41_callback_up_net(serv, net);
- break;
- default:
- printk(KERN_ERR "NFS: unknown callback version: %d\n",
- minorversion);
- ret = -EINVAL;
- break;
- }
+ ret = -EPROTONOSUPPORT;
+ if (minorversion == 0)
+ ret = nfs4_callback_up_net(serv, net);
+ else if (xprt->ops->bc_up)
+ ret = xprt->ops->bc_up(serv, net);
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
@@ -364,7 +340,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
goto err_create;
}
- ret = nfs_callback_up_net(minorversion, serv, net);
+ ret = nfs_callback_up_net(minorversion, serv, net, xprt);
if (ret < 0)
goto err_net;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 84326e9fb47a..ff8195bd75ea 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -61,7 +61,6 @@ struct cb_compound_hdr_res {
};
struct cb_getattrargs {
- struct sockaddr *addr;
struct nfs_fh fh;
uint32_t bitmap[2];
};
@@ -76,7 +75,6 @@ struct cb_getattrres {
};
struct cb_recallargs {
- struct sockaddr *addr;
struct nfs_fh fh;
nfs4_stateid stateid;
uint32_t truncate;
@@ -119,9 +117,6 @@ extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_sequenceres *res,
struct cb_process_state *cps);
-extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
#define RCA4_TYPE_MASK_DIR_DLG 2
@@ -134,7 +129,6 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_ALL 0xf31f
struct cb_recallanyargs {
- struct sockaddr *craa_addr;
uint32_t craa_objs_to_keep;
uint32_t craa_type_mask;
};
@@ -144,7 +138,6 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
struct cb_process_state *cps);
struct cb_recallslotargs {
- struct sockaddr *crsa_addr;
uint32_t crsa_target_highest_slotid;
};
extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
@@ -152,7 +145,6 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
struct cb_process_state *cps);
struct cb_layoutrecallargs {
- struct sockaddr *cbl_addr;
uint32_t cbl_recall_type;
uint32_t cbl_layout_type;
uint32_t cbl_layoutchanged;
@@ -196,9 +188,6 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
#if IS_ENABLED(CONFIG_NFS_V4)
extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
extern void nfs_callback_down(int minorversion, struct net *net);
-extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#endif /* CONFIG_NFS_V4 */
/*
* nfs41: Callbacks are expected to not cause substantial latency,
@@ -209,6 +198,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#define NFS41_BC_MAX_CALLBACKS 1
extern unsigned int nfs_callback_set_tcpport;
-extern unsigned short nfs_callback_tcpport;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b85cf7a30232..f0939d097406 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -17,9 +17,7 @@
#include "nfs4session.h"
#include "nfs4trace.h"
-#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-#endif
__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
struct cb_getattrres *res,
@@ -85,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
res = htonl(NFS4ERR_BADHANDLE);
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+ &args->stateid, -ntohl(res));
goto out;
+ }
/* Set up a helper thread to actually return the delegation */
switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
case 0:
@@ -98,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
default:
res = htonl(NFS4ERR_RESOURCE);
}
- trace_nfs4_recall_delegation(inode, -ntohl(res));
+ trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+ &args->stateid, -ntohl(res));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -162,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
return lo;
}
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new)
+{
+ u32 oldseq, newseq;
+
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ newseq = be32_to_cpu(new->seqid);
+
+ if (newseq > oldseq + 1)
+ return false;
+ return true;
+}
+
static u32 initiate_file_draining(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
@@ -171,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp,
LIST_HEAD(free_me_list);
lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
- if (!lo)
+ if (!lo) {
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
+ &args->cbl_stateid, -rv);
goto out;
+ }
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
+ if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
+ rv = NFS4ERR_DELAY;
+ goto unlock;
+ }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
spin_unlock(&ino->i_lock);
pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range)) {
+ /*
+ * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+ */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
rv = NFS4ERR_DELAY;
goto unlock;
}
+ if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+ &args->cbl_range)) {
+ rv = NFS4_OK;
+ goto unlock;
+ }
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
+ pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(ino, 0);
pnfs_put_layout_hdr(lo);
- trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+ &args->cbl_stateid, -rv);
iput(ino);
out:
return rv;
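pnfs_check_stateid_sequence() above enforces the RFC 5661 section 12.5.5.2.1 ordering rule: a recalled layout stateid may be at most one seqid ahead of the one the client holds, and a larger jump means a missed update, so the recall is answered with NFS4ERR_DELAY. The comparison reduced to a standalone sketch (like the patch, no seqid wraparound handling):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* newseq may be at most oldseq + 1. */
    static bool stateid_seq_ok(uint32_t oldseq, uint32_t newseq)
    {
        return newseq <= oldseq + 1;
    }

    int main(void)
    {
        printf("held 5, recalled 6: %s\n",
               stateid_seq_ok(5, 6) ? "process" : "NFS4ERR_DELAY");
        printf("held 5, recalled 8: %s\n",
               stateid_seq_ok(5, 8) ? "process" : "NFS4ERR_DELAY");
        return 0;
    }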
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 6b1697a01dde..646cdac73488 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -18,19 +18,21 @@
#include "internal.h"
#include "nfs4session.h"
-#define CB_OP_TAGLEN_MAXSZ (512)
-#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
-#define CB_OP_GETATTR_BITMAP_MAXSZ (4)
-#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- CB_OP_GETATTR_BITMAP_MAXSZ + \
- 2 + 2 + 3 + 3)
-#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ /* change, size, ctime, mtime */\
+ (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- 4 + 1 + 3)
+ NFS4_MAX_SESSIONID_LEN + \
+ (1 + 3) * 4) // seqid, 3 slotids
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#endif /* CONFIG_NFS_V4_1 */
@@ -157,7 +159,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(status != 0))
return status;
/* We do not like overly long tags! */
- if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) {
printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
__func__, hdr->taglen);
return htonl(NFS4ERR_RESOURCE);
@@ -198,7 +200,6 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
goto out;
- args->addr = svc_addr(rqstp);
status = decode_bitmap(xdr, args->bitmap);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
@@ -210,7 +211,6 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
__be32 *p;
__be32 status;
- args->addr = svc_addr(rqstp);
status = decode_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
goto out;
@@ -236,7 +236,6 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- args->cbl_addr = svc_addr(rqstp);
p = read_buf(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
@@ -383,13 +382,12 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = read_buf(xdr, len);
+ p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(sid->data, p, len);
+ memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -500,7 +498,6 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- args->craa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -519,7 +516,6 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
{
__be32 *p;
- args->crsa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -684,13 +680,12 @@ static __be32 encode_sessionid(struct xdr_stream *xdr,
const struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = xdr_reserve_space(xdr, len);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(p, sid, len);
+ memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -704,7 +699,9 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
if (unlikely(status != 0))
goto out;
- encode_sessionid(xdr, &res->csr_sessionid);
+ status = encode_sessionid(xdr, &res->csr_sessionid);
+ if (status)
+ goto out;
p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
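The rewritten CB_OP_*_MAXSZ macros above count bytes rather than XDR words, making the "* 4" per 32-bit word explicit. Reproducing the getattr reply bound as a quick standalone check of the arithmetic:

    #include <stdio.h>

    /* Same arithmetic as the patch: each XDR word is 4 bytes. */
    #define CB_OP_HDR_RES_MAXSZ        (2 * 4)    /* opcode, status */
    #define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4)    /* bitmap length, 3 bitmaps */
    #define CB_OP_GETATTR_RES_MAXSZ    (CB_OP_HDR_RES_MAXSZ + \
                                        CB_OP_GETATTR_BITMAP_MAXSZ + \
                                        (2 + 2 + 3 + 3) * 4)

    int main(void)
    {
        /* change, size, ctime, mtime account for the (2 + 2 + 3 + 3) words. */
        printf("getattr reply bound: %d bytes\n", CB_OP_GETATTR_RES_MAXSZ);
        return 0;
    }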
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 57c5a02f6213..d6d5d2a48e83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -764,6 +764,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->time_delta = fsinfo->time_delta;
+ server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne. Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index be806ead7f4d..5166adcfc0fb 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -721,14 +721,12 @@ int nfs_async_inode_return_delegation(struct inode *inode,
struct nfs_client *clp = server->nfs_client;
struct nfs_delegation *delegation;
- filemap_flush(inode->i_mapping);
-
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation == NULL)
goto out_enoent;
-
- if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+ if (stateid != NULL &&
+ !clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
goto out_enoent;
nfs_mark_return_delegation(server, delegation);
rcu_read_unlock();
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3d8e4ffa0a33..c82a21228a34 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1714,9 +1714,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
@@ -1897,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_USER);
if (!page)
return -ENOMEM;
- kaddr = kmap_atomic(page);
+ kaddr = page_address(page);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
- kunmap_atomic(kaddr);
trace_nfs_symlink_enter(dir, dentry);
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
@@ -2435,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
}
EXPORT_SYMBOL_GPL(nfs_may_open);
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ if (mask & MAY_NOT_BLOCK)
+ ret = nfs_revalidate_inode_rcu(server, inode);
+ else
+ ret = nfs_revalidate_inode(server, inode);
+ if (ret == 0 && !execute_ok(inode))
+ ret = -EACCES;
+ return ret;
+}
+
int nfs_permission(struct inode *inode, int mask)
{
struct rpc_cred *cred;
@@ -2452,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask)
case S_IFLNK:
goto out;
case S_IFREG:
+ if ((mask & MAY_OPEN) &&
+ nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+ return 0;
break;
case S_IFDIR:
/*
@@ -2484,8 +2497,8 @@ force_lookup:
res = PTR_ERR(cred);
}
out:
- if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
- res = -EACCES;
+ if (!res && (mask & MAY_EXEC))
+ res = nfs_execute_ok(inode, mask);
dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4b1d08f56aba..7ab7ec9f4eed 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
-void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
-{
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-}
-EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
-
static void
nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
{
@@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
req = nfs_list_entry(reqs.next);
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ list_splice_init(&reqs, &failed);
+ goto out_failed;
+ }
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
@@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_list_add_request(req, &failed);
spin_lock(cinfo.lock);
dreq->flags = 0;
- dreq->error = -EIO;
+ if (desc.pg_error < 0)
+ dreq->error = desc.pg_error;
+ else
+ dreq->error = -EIO;
spin_unlock(cinfo.lock);
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
+out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_direct_write_complete(dreq, data->inode);
}
-static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
- /* There is no lock to clear */
+ struct nfs_direct_req *dreq = cinfo->dreq;
+
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_mark_request_commit(req, NULL, cinfo, 0);
}
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
.completion = nfs_direct_commit_complete,
- .error_cleanup = nfs_direct_error_cleanup,
+ .resched_write = nfs_direct_resched_write,
};
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
}
}
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+
+ spin_lock(&dreq->lock);
+ if (dreq->error == 0) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ /* fake unstable write to let common nfs resend pages */
+ hdr->verf.committed = NFS_UNSTABLE;
+ hdr->good_bytes = hdr->args.count;
+ }
+ spin_unlock(&dreq->lock);
+}
+
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.error_cleanup = nfs_write_sync_pgio_error,
.init_hdr = nfs_direct_pgio_init,
.completion = nfs_direct_write_completion,
+ .reschedule_io = nfs_direct_write_reschedule_io,
};
@@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
nfs_lock_request(req);
req->wb_index = pos >> PAGE_SHIFT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c0f9b1ed12b9..4ef8f5addcad 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Always try to initiate a 'commit' if relevant, but only
- * wait for it if __GFP_WAIT is set. Even then, only wait 1
- * second and only if the 'bdi' is not congested.
+ * wait for it if the caller allows blocking. Even then,
+ * only wait 1 second and only if the 'bdi' is not congested.
* Waiting indefinitely can cause deadlocks when the NFS
* server is on this machine, when a new TCP connection is
* needed and in other rare cases. There is no particular
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
if (mapping) {
struct nfs_server *nfss = NFS_SERVER(mapping->host);
nfs_commit_inode(mapping->host, 0);
- if ((gfp & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(gfp) &&
!bdi_write_congested(&nfss->backing_dev_info)) {
wait_on_page_bit_killable_timeout(page, PG_private,
HZ);
@@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page,
* so it will not block due to pages that will shortly be freeable.
*/
nfsi = NFS_I(mapping->host);
- if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+ if (atomic_read(&nfsi->commit_info.rpcs_out)) {
*writeback = true;
return;
}
@@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_page(inode, page);
+ return nfs_wb_launder_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -738,18 +738,7 @@ out_noconflict:
static int do_vfs_lock(struct file *file, struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_file_wait(file, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_file_wait(file, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_file_wait(file, fl);
}
static int
@@ -767,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
if (!IS_ERR(l_ctx)) {
- status = nfs_iocounter_wait(&l_ctx->io_count);
+ status = nfs_iocounter_wait(l_ctx);
nfs_put_lock_context(l_ctx);
if (status < 0)
return status;
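nfs_release_page() above replaces its raw __GFP_WAIT test with gfpflags_allow_blocking(), which names the question the caller actually cares about: may this allocation context sleep while we briefly wait on a commit? A toy version of such a predicate (the flag value here is a stand-in, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    #define ___GFP_DIRECT_RECLAIM 0x400000u    /* illustrative bit value */

    /* "May this allocation context block?" */
    static bool gfpflags_allow_blocking(unsigned int gfp_flags)
    {
        return gfp_flags & ___GFP_DIRECT_RECLAIM;
    }

    int main(void)
    {
        printf("atomic context may block: %d\n",
               gfpflags_allow_blocking(0));
        printf("sleepable context may block: %d\n",
               gfpflags_allow_blocking(___GFP_DIRECT_RECLAIM));
        return 0;
    }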
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 02ec07973bc4..bb1f4e7a3270 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
pnfs_error_mark_layout_for_return(inode, lseg);
+ pnfs_set_lo_fail(lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -883,13 +884,19 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_read_mds(pgio);
@@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
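
Both pg_init paths above now cope with pnfs_update_layout() returning an ERR_PTR-encoded error rather than only NULL-or-segment: the error is copied into pg_error and the pointer is cleared so the later NULL checks still behave. A userspace rendition of the error-pointer idiom, with simplified helpers:

    #include <stdio.h>

    #define MAX_ERRNO 4095
    static void *ERR_PTR(long err) { return (void *)err; }
    static long PTR_ERR(const void *p) { return (long)p; }
    static int IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    struct lseg { int iomode; };

    /* Stand-in for pnfs_update_layout(): NULL, a segment, or an error. */
    static struct lseg *update_layout(int want_error)
    {
        static struct lseg seg = { .iomode = 1 };
        if (want_error)
            return ERR_PTR(-12);   /* -ENOMEM */
        return &seg;
    }

    int main(void)
    {
        struct lseg *l = update_layout(1);
        if (IS_ERR(l)) {
            int pg_error = (int)PTR_ERR(l); /* record the error... */
            l = NULL;                       /* ...then clear the pointer */
            printf("pg_error = %d\n", pg_error);
        }
        return 0;
    }
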
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index fbc5a56de875..6594e9f903a0 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return false;
for (i = 0; i < m1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; i++) {
+ for (j = 0; j < m2->fh_versions_cnt; j++) {
if (nfs_compare_fh(&m1->fh_versions[i],
&m2->fh_versions[j]) == 0) {
found_fh = true;
@@ -339,6 +339,19 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
}
}
+static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
+{
+ struct nfs4_deviceid_node *node;
+ int i;
+
+ if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
+ return;
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ node = &fls->mirror_array[i]->mirror_ds->id_node;
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ }
+}
+
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_layoutget_res *lgr,
@@ -492,13 +505,22 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
}
p = xdr_inline_decode(&stream, 4);
- if (p)
- fls->flags = be32_to_cpup(p);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ for (i = 0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+out_sort_mirrors:
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
goto out_err_free;
+ ff_layout_mark_devices_valid(fls);
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
@@ -589,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
- if (layoutstats_timer != 0)
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
report_interval) {
@@ -741,17 +765,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx,
int *best_idx)
{
- struct nfs4_ff_layout_segment *fls;
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_pnfs_ds *ds;
int idx;
- fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
/* mirrors are sorted by efficiency */
- for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
if (ds) {
*best_idx = idx;
return ds;
@@ -771,18 +795,24 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
- ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
if (!ds)
goto out_mds;
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -811,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int i;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -853,18 +889,25 @@ static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
if (pgio->pg_lseg)
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
nfs_pageio_reset_write_mds(pgio);
+out:
return 1;
}
@@ -898,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
hdr->args.count,
(unsigned long long)hdr->args.offset);
- if (!hdr->dreq) {
- struct nfs_open_context *ctx;
-
- ctx = nfs_list_entry(hdr->pages.next)->wb_context;
- set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
- hdr->completion_ops->error_cleanup(&hdr->pages);
- } else {
- nfs_direct_set_resched_writes(hdr->dreq);
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.count;
- }
+ hdr->completion_ops->reschedule_io(hdr);
return;
}
@@ -1035,7 +1067,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
- if (ff_layout_has_available_ds(lseg))
+ if (ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1086,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
task->tk_status = 0;
- rpc_restart_call(task);
+ rpc_restart_call_prepare(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
return -EAGAIN;
}
@@ -1144,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ return;
+ default:
+ break;
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
@@ -1153,7 +1194,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
/* NFS_PROTO call done callback routines */
-
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1171,6 +1211,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ if (ff_layout_choose_best_ds_for_read(hdr->lseg,
+ hdr->pgio_mirror_idx + 1,
+ &hdr->pgio_mirror_idx))
+ goto out_eagain;
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
@@ -1179,11 +1223,13 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
ff_layout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
- return -EAGAIN;
+ goto out_eagain;
}
return 0;
+out_eagain:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
}
static bool
@@ -1222,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
return ff_layout_test_devid_unavailable(node);
}
-static int ff_layout_read_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ hdr->res.count);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1245,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
}
hdr->pgio_done_cb = ff_layout_read_done_cb;
+ ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1303,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1321,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_read_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}
+static void ff_layout_read_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1342,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, true);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -1382,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1401,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return 0;
}
-static int ff_layout_write_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+}
+
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1425,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EAGAIN;
}
+ ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1460,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1479,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_write_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(cdata->inode,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
0, task->tk_start);
}
+
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
ff_layout_commit_prepare_common(task, data);
@@ -1511,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
- struct nfs_commit_data *cdata = data;
- struct nfs_page *req;
- __u64 count = 0;
-
- if (task->tk_status == 0) {
- list_for_each_entry(req, &cdata->pages, wb_list)
- count += req->wb_bytes;
- }
-
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- count, count, NFS_FILE_SYNC);
-
pnfs_generic_write_commit_done(task, data);
}
@@ -1531,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
+ ff_layout_commit_record_layoutstats_done(task, cdata);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static enum pnfs_try_status
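
The recurring theme in this file is moving layoutstats start/end recording behind a single NFS_IOHDR_STAT bit: test_and_set_bit() makes the start idempotent across RPC retries, and test_and_clear_bit() lets both the rpc_count_stats and the new rpc_release callbacks invoke the done helper without double-counting. A sketch of that pairing, using C11 atomics in place of the kernel bitops:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_uint flags = 0;
    #define STAT_BIT 0x1u

    static bool test_and_set_stat(void)
    {
        return atomic_fetch_or(&flags, STAT_BIT) & STAT_BIT;
    }

    static bool test_and_clear_stat(void)
    {
        return atomic_fetch_and(&flags, ~STAT_BIT) & STAT_BIT;
    }

    static void record_start(void)
    {
        if (test_and_set_stat())
            return;                 /* already started: retried RPC, skip */
        printf("io start recorded\n");
    }

    static void record_done(void)
    {
        if (!test_and_clear_stat())
            return;                 /* already ended, or never started */
        printf("io end recorded\n");
    }

    int main(void)
    {
        record_start();
        record_start();   /* retry: no double-count */
        record_done();    /* counted once, from count_stats */
        record_done();    /* release after count_stats: no double-count */
        return 0;
    }
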
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 68cc0d9828f9..dd353bb7dc0a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,6 +10,7 @@
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
#include "../pnfs.h"
@@ -84,6 +85,7 @@ struct nfs4_ff_layout_mirror {
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
ktime_t last_report_time;
+ u32 report_interval;
};
struct nfs4_ff_layout_segment {
@@ -146,6 +148,12 @@ FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
}
static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+static inline bool
ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
{
return nfs4_test_deviceid_unavailable(node);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e125e55de86d..bd0327541366 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -429,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (fail_return) {
- pnfs_error_mark_layout_for_return(ino, lseg);
- if (ff_layout_has_available_ds(lseg))
- pnfs_set_retry_layoutget(lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(lseg->pls_layout);
-
- } else {
+ if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lseg->pls_layout->plh_flags);
- else {
+ else
pnfs_error_mark_layout_for_return(ino, lseg);
- pnfs_clear_retry_layoutget(lseg->pls_layout);
- }
- }
+ } else
+ pnfs_error_mark_layout_for_return(ino, lseg);
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 326d9e10d833..8e24d886d2c5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
return nfs_fileid_to_ino_t(fattr->fileid);
}
-/**
- * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
- * @word: long word containing the bit lock
- */
-int nfs_wait_bit_killable(struct wait_bit_key *key)
+static int nfs_wait_killable(int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+ return nfs_wait_killable(mode);
+}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+int nfs_wait_atomic_killable(atomic_t *p)
+{
+ return nfs_wait_killable(TASK_KILLABLE);
+}
+
/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
@@ -408,9 +414,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_fop = NULL;
inode->i_flags |= S_AUTOMOUNT;
}
- } else if (S_ISLNK(inode->i_mode))
+ } else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nfs_symlink_inode_operations;
- else
+ inode_nohighmem(inode);
+ } else
init_special_inode(inode, inode->i_mode, fattr->rdev);
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
@@ -618,7 +625,10 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
- nfs_update_inode(inode, fattr);
+ if (fattr->valid)
+ nfs_update_inode(inode, fattr);
+ else
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
@@ -696,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
l_ctx->lockowner.l_owner = current->files;
l_ctx->lockowner.l_pid = current->tgid;
INIT_LIST_HEAD(&l_ctx->list);
- nfs_iocounter_init(&l_ctx->io_count);
+ atomic_set(&l_ctx->io_count, 0);
}
static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
@@ -909,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ /*
+ * A fatal error occurred on a previous write. Try to
+ * write back every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -1083,6 +1099,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
|| NFS_STALE(inode);
}
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
/**
* __nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
@@ -1638,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1699,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
inode->i_version = fattr->change_attr;
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity;
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- } else if (server->caps & NFS_CAP_MTIME)
+ } else if (server->caps & NFS_CAP_MTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- } else if (server->caps & NFS_CAP_CTIME)
+ } else if (server->caps & NFS_CAP_CTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1734,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)cur_isize,
(long long)new_isize);
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- else if (server->caps & NFS_CAP_ATIME)
+ else if (server->caps & NFS_CAP_ATIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -1755,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_mode = newmode;
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
- } else if (server->caps & NFS_CAP_MODE)
+ } else if (server->caps & NFS_CAP_MODE) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- } else if (server->caps & NFS_CAP_OWNER)
+ } else if (server->caps & NFS_CAP_OWNER) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ } else if (server->caps & NFS_CAP_OWNER_GROUP) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1793,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
}
- } else if (server->caps & NFS_CAP_NLINK)
+ } else if (server->caps & NFS_CAP_NLINK) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- }
- if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
+ else
+ cache_revalidated = false;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1815,16 +1872,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
- if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
- if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
- nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ if (cache_revalidated) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+ nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ nfsi->attrtimeo <<= 1;
+ if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ }
nfsi->attrtimeo_timestamp = now;
}
/* Set the barrier to be more recent than this fattr */
if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
nfsi->attr_gencount = fattr->gencount;
}
- invalid &= ~NFS_INO_INVALID_ATTR;
+
+ /* Don't declare attrcache up to date if there were no attrs! */
+ if (cache_revalidated)
+ invalid &= ~NFS_INO_INVALID_ATTR;
+
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
@@ -1904,7 +1969,7 @@ static int __init nfs_init_inodecache(void)
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
sizeof(struct nfs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (nfs_inode_cachep == NULL)
return -ENOMEM;
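
The attrtimeo change above only lengthens the attribute cache timeout when the reply actually carried a full attribute set (cache_revalidated); the growth itself remains exponential backoff with a clamp. A tiny sketch of the doubling-with-cap, using made-up bounds (the kernel derives them from mount options):

    #include <stdio.h>

    #define MINATTRTIMEO 3UL
    #define MAXATTRTIMEO 60UL   /* illustrative bounds only */

    static unsigned long grow_attrtimeo(unsigned long attrtimeo)
    {
        attrtimeo <<= 1;                  /* double the cache timeout... */
        if (attrtimeo > MAXATTRTIMEO)
            attrtimeo = MAXATTRTIMEO;     /* ...but clamp it */
        return attrtimeo;
    }

    int main(void)
    {
        unsigned long t = MINATTRTIMEO;
        for (int i = 0; i < 6; i++) {
            printf("attrtimeo = %lus\n", t);
            t = grow_attrtimeo(t);
        }
        return 0;
    }
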
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 56cfde26fb9c..4e8cc942336c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
-int nfs_iocounter_wait(struct nfs_io_counter *c);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
@@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline void nfs_iocounter_init(struct nfs_io_counter *c)
-{
- c->flags = 0;
- atomic_set(&c->io_count, 0);
-}
-
static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
{
WARN_ON_ONCE(desc->pg_mirror_count < 1);
return desc->pg_mirror_count > 1;
}
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+ const struct nfs_open_context *ctx2)
+{
+ return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -379,7 +379,8 @@ extern int nfs_drop_inode(struct inode *);
extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
-extern int nfs_wait_bit_killable(struct wait_bit_key *key);
+extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p);
/* super.c */
extern const struct super_operations nfs_sops;
@@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
inode_dio_wait(inode);
}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
-extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+ NFS4_STATEID_OTHER_SIZE);
+}
#else
static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return 0;
}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return 0;
+}
#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -E2BIG:
+ return true;
+ default:
+ return false;
+ }
+}
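
nfs_stateid_hash() reuses the filehandle-hash recipe: a CRC-32 over the stateid's "other" field. The kernel's ~crc32_le(0xFFFFFFFF, ...) form should match the standard (zlib-style) CRC-32, which applies the pre- and post-inversion internally; a userspace equivalent (link with -lz):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    #define STATEID_OTHER_SIZE 12

    /* Userspace analogue of nfs_stateid_hash(). */
    static uint32_t stateid_hash(const unsigned char *other)
    {
        return (uint32_t)crc32(0L, other, STATEID_OTHER_SIZE);
    }

    int main(void)
    {
        unsigned char other[STATEID_OTHER_SIZE];
        memset(other, 0xab, sizeof(other));
        printf("hash = 0x%08x\n", stateid_hash(other));
        return 0;
    }
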
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 99a45283b9ee..09b190015df4 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,9 +16,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
-#ifdef NFS_DEBUG
-# define NFSDBG_FACILITY NFSDBG_MOUNT
-#endif
+#define NFSDBG_FACILITY NFSDBG_MOUNT
/*
* Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 1ebe2fc7cda2..17c0fa1eccfa 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
int error;
error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
- POSIX_ACL_XATTR_ACCESS, data, size, &result);
+ XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result);
if (error)
return error;
error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
- POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+ XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result);
if (error)
return error;
return result;
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 814c1255f1d2..b587ccd31083 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,5 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 0f020e4d8421..6e8174930a48 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -14,7 +14,7 @@
#include "pnfs.h"
#include "internal.h"
-#define NFSDBG_FACILITY NFSDBG_PNFS
+#define NFSDBG_FACILITY NFSDBG_PROC
static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
fmode_t fmode)
@@ -204,6 +204,8 @@ static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo;
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -211,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match(&data->args.stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
case -ENOTSUPP:
case -EOPNOTSUPP:
- NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
default:
- dprintk("%s server returns %d\n", __func__, task->tk_status);
+ break;
}
+
+ dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
@@ -271,3 +296,75 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return PTR_ERR(task);
return 0;
}
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+ struct file *dst_f, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
+{
+ struct inode *src_inode = file_inode(src_f);
+ struct inode *dst_inode = file_inode(dst_f);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct nfs42_clone_args args = {
+ .src_fh = NFS_FH(src_inode),
+ .dst_fh = NFS_FH(dst_inode),
+ .src_offset = src_offset,
+ .dst_offset = dst_offset,
+ .count = count,
+ .dst_bitmask = server->cache_consistency_bitmask,
+ };
+ struct nfs42_clone_res res = {
+ .server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+ if (status)
+ return status;
+
+ res.dst_fattr = nfs_alloc_fattr();
+ if (!res.dst_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0)
+ status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+
+ kfree(res.dst_fattr);
+ return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+ loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+ };
+ struct inode *inode = file_inode(src_f);
+ struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+ return -EOPNOTSUPP;
+
+ do {
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
+ dst_offset, count);
+ if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+ return -EOPNOTSUPP;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ return err;
+}
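
nfs42_proc_clone() follows the usual NFSv4 synchronous-call idiom: issue the compound, hand any error to nfs4_handle_exception(), and loop while the exception handler marks it retryable (it sleeps or waits for recovery before setting retry). A skeleton of that loop, with invented error handling:

    #include <stdbool.h>
    #include <stdio.h>

    struct exception { bool retry; long timeout; };

    static int do_call(int attempt)
    {
        return attempt < 2 ? -11 /* pretend NFS4ERR_DELAY */ : 0;
    }

    static int handle_exception(int err, struct exception *exc)
    {
        if (err == -11) {         /* retryable: back off, go again */
            exc->timeout = exc->timeout ? exc->timeout * 2 : 1;
            exc->retry = true;
            return 0;
        }
        exc->retry = false;
        return err;               /* success or fatal: propagate */
    }

    int main(void)
    {
        struct exception exc = { 0 };
        int attempt = 0, err;

        do {
            err = do_call(attempt++);
            err = handle_exception(err, &exc);
        } while (exc.retry);
        printf("final err = %d after %d attempts\n", err, attempt);
        return 0;
    }
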
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0eb29e14070d..0ca482a51e53 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -34,6 +34,12 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
@@ -65,7 +71,20 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
-
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)
static void encode_fallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args)
@@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr,
encode_uint32(xdr, 0);
}
+static void encode_clone(struct xdr_stream *xdr,
+ struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -243,6 +298,11 @@ static int decode_layoutstats(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -351,4 +411,39 @@ out:
return status;
}
+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_clone(xdr);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->dst_fattr, res->server);
+
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
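
The CLONE operation encodes src_offset, dst_offset and count as XDR "hyper" values, which is why encode_clone() reserves 3*8 bytes and emits each with xdr_encode_hyper(). An XDR hyper is a plain big-endian 64-bit quantity; a self-contained sketch of the encoding:

    #include <stdint.h>
    #include <stdio.h>

    /* Emit one XDR hyper: 8 bytes, most significant first. */
    static unsigned char *encode_hyper(unsigned char *p, uint64_t v)
    {
        for (int i = 7; i >= 0; i--)
            *p++ = (unsigned char)(v >> (8 * i));
        return p;
    }

    int main(void)
    {
        unsigned char buf[24], *p = buf;
        p = encode_hyper(p, 0);      /* src_offset */
        p = encode_hyper(p, 4096);   /* dst_offset */
        p = encode_hyper(p, 65536);  /* count */
        for (size_t i = 0; i < sizeof(buf); i++)
            printf("%02x%s", buf[i], (i % 8 == 7) ? "\n" : " ");
        return 0;
    }
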
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 50cfc4ca7a02..4afdee420d25 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -183,10 +183,12 @@ struct nfs4_state {
struct nfs4_exception {
- long timeout;
- int retry;
struct nfs4_state *state;
struct inode *inode;
+ long timeout;
+ unsigned char delay : 1,
+ recovering : 1,
+ retry : 1;
};
struct nfs4_state_recovery_ops {
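
The reworked struct nfs4_exception packs delay, recovering and retry into one unsigned char via bitfields, so nfs4_do_handle_exception() can hand flag state back to sync and async callers instead of acting on it inline. A minimal bitfield example:

    #include <stdio.h>

    struct exception_flags {
        unsigned char delay : 1,
                      recovering : 1,
                      retry : 1;
    };

    int main(void)
    {
        struct exception_flags f = { 0 };
        f.delay = 1;
        f.retry = 1;
        printf("sizeof = %zu, delay=%u recovering=%u retry=%u\n",
               sizeof(f), f.delay, f.recovering, f.retry);
        return 0;
    }
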
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 223bedda64ae..10410e8b5853 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
return ret;
idr_preload(GFP_KERNEL);
spin_lock(&nn->nfs_client_lock);
- ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
+ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
if (ret >= 0)
clp->cl_cb_ident = ret;
spin_unlock(&nn->nfs_client_lock);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index b0dbe0abed53..26f9a23e2b25 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,8 +4,10 @@
* Copyright (C) 1992 Rick Sladkey
*/
#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
+#include <uapi/linux/btrfs.h> /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
@@ -192,14 +194,72 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_deallocate(filep, offset, len);
return nfs42_proc_allocate(filep, offset, len);
}
+
+static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, u64 count)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct inode *src_inode = file_inode(src_file);
+ unsigned int bs = server->clone_blksize;
+ bool same_inode = false;
+ int ret;
+
+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out;
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out;
+ }
+
+ if (src_inode == dst_inode)
+ same_inode = true;
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ if (same_inode) {
+ mutex_lock(&src_inode->i_mutex);
+ } else if (dst_inode < src_inode) {
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ if (same_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ } else if (dst_inode < src_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ mutex_unlock(&dst_inode->i_mutex);
+ } else {
+ mutex_unlock(&dst_inode->i_mutex);
+ mutex_unlock(&src_inode->i_mutex);
+ }
+out:
+ return ret;
+}
#endif /* CONFIG_NFS_V4_2 */
const struct file_operations nfs4_file_operations = {
-#ifdef CONFIG_NFS_V4_2
- .llseek = nfs4_file_llseek,
-#else
- .llseek = nfs_file_llseek,
-#endif
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
@@ -211,9 +271,13 @@ const struct file_operations nfs4_file_operations = {
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
-#ifdef CONFIG_NFS_V4_2
- .fallocate = nfs42_fallocate,
-#endif /* CONFIG_NFS_V4_2 */
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+#ifdef CONFIG_NFS_V4_2
+ .llseek = nfs4_file_llseek,
+ .fallocate = nfs42_fallocate,
+ .clone_file_range = nfs42_clone_file_range,
+#else
+ .llseek = nfs_file_llseek,
+#endif
};
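
When src and dst differ, nfs42_clone_file_range() takes the two inode mutexes in a fixed, address-based order via mutex_lock_nested(), which rules out ABBA deadlock between concurrent clones running in opposite directions; the same inode is locked only once. A userspace sketch of address-ordered pair locking (link with -lpthread):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct obj { pthread_mutex_t lock; };

    static void lock_pair(struct obj *a, struct obj *b)
    {
        if (a == b) {
            pthread_mutex_lock(&a->lock);
        } else if ((uintptr_t)b < (uintptr_t)a) {
            pthread_mutex_lock(&b->lock);  /* lower address first */
            pthread_mutex_lock(&a->lock);
        } else {
            pthread_mutex_lock(&a->lock);
            pthread_mutex_lock(&b->lock);
        }
    }

    static void unlock_pair(struct obj *a, struct obj *b)
    {
        if (a == b) {
            pthread_mutex_unlock(&a->lock);
            return;
        }
        /* reverse of the acquisition order */
        if ((uintptr_t)b < (uintptr_t)a) {
            pthread_mutex_unlock(&a->lock);
            pthread_mutex_unlock(&b->lock);
        } else {
            pthread_mutex_unlock(&b->lock);
            pthread_mutex_unlock(&a->lock);
        }
    }

    int main(void)
    {
        struct obj x = { PTHREAD_MUTEX_INITIALIZER };
        struct obj y = { PTHREAD_MUTEX_INITIALIZER };
        lock_pair(&x, &y);
        printf("both locked, in address order\n");
        unlock_pair(&x, &y);
        return 0;
    }
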
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 2e4902203c35..5ba22c6b0ffa 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -297,7 +297,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
{
const struct cred *saved_cred;
struct key *rkey;
- struct user_key_payload *payload;
+ const struct user_key_payload *payload;
ssize_t ret;
saved_cred = override_creds(id_resolver_cache);
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
if (ret < 0)
goto out_up;
- payload = rcu_dereference(rkey->payload.rcudata);
+ payload = user_key_payload(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5133bb18830e..4bfc33ad0563 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -78,7 +78,6 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -209,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY,
FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ | FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_open_noattr_bitmap[3] = {
@@ -239,6 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD1_TIME_DELTA
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
};
const u32 nfs4_fs_locations_bitmap[3] = {
@@ -344,13 +347,16 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_do_handle_exception(struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
struct inode *inode = exception->inode;
int ret = errorcode;
+ exception->delay = 0;
+ exception->recovering = 0;
exception->retry = 0;
switch(errorcode) {
case 0:
@@ -359,11 +365,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs4_inode_return_delegation(inode);
- exception->retry = 1;
- return 0;
- }
+ if (inode && nfs_async_inode_return_delegation(inode,
+ NULL) == 0)
+ goto wait_on_recovery;
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -409,11 +413,12 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = -EBUSY;
break;
}
- case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
- ret = nfs4_delay(server->client, &exception->timeout);
- if (ret != 0)
- break;
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ case -NFS4ERR_GRACE:
+ exception->delay = 1;
+ return 0;
+
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_OLD_STATEID:
exception->retry = 1;
@@ -434,14 +439,85 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
/* We failed to handle the error */
return nfs4_map_errors(ret);
wait_on_recovery:
- ret = nfs4_wait_clnt_recover(clp);
+ exception->recovering = 1;
+ return 0;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ ret = nfs4_delay(server->client, &exception->timeout);
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ ret = nfs4_wait_clnt_recover(clp);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -EIO;
+ goto out_retry;
+ }
+ return ret;
+out_retry:
+ if (ret == 0)
+ exception->retry = 1;
+ return ret;
+}
+
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ rpc_delay(task, nfs4_update_delay(&exception->timeout));
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+ rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+ goto out_retry;
+ }
if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- return -EIO;
+ ret = -EIO;
+ return ret;
+out_retry:
if (ret == 0)
exception->retry = 1;
return ret;
}
+
+static int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
+{
+ struct nfs4_exception exception = {
+ .state = state,
+ };
+
+ if (task->tk_status >= 0)
+ return 0;
+ if (timeout)
+ exception.timeout = *timeout;
+ task->tk_status = nfs4_async_handle_exception(task, server,
+ task->tk_status,
+ &exception);
+ if (exception.delay && timeout)
+ *timeout = exception.timeout;
+ if (exception.retry)
+ return -EAGAIN;
+ return 0;
+}
+
/*
* Return 'true' if 'clp' is using an rpc_client that is integrity protected
* or 'false' otherwise.
@@ -1312,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
+ spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
if (deleg_stateid != NULL) {
nfs4_stateid_copy(&state->stateid, deleg_stateid);
@@ -1320,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
if (open_stateid != NULL)
nfs_set_open_stateid_locked(state, open_stateid, fmode);
write_sequnlock(&state->seqlock);
- spin_lock(&state->owner->so_lock);
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
@@ -1525,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
if (!data->rpc_done) {
state = nfs4_try_open_cached(data);
+ trace_nfs4_cached_open(data->state);
goto out;
}
@@ -1942,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
}
return;
unlock_no_action:
+ trace_nfs4_cached_open(data->state);
rcu_read_unlock();
out_no_action:
task->tk_action = NULL;
@@ -2630,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
+ trace_nfs4_setattr(inode, &arg.stateid, status);
return status;
}
@@ -2646,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
int err;
do {
err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
- trace_nfs4_setattr(inode, err);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -4530,7 +4608,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
static int buf_to_pages_noslab(const void *buf, size_t buflen,
- struct page **pages, unsigned int *pgbase)
+ struct page **pages)
{
struct page *newpage, **spages;
int rc = 0;
@@ -4674,7 +4752,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
goto out_free;
args.acl_len = npages * PAGE_SIZE;
- args.acl_pgbase = 0;
dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
__func__, buf, buflen, npages, args.acl_len);
@@ -4766,7 +4843,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
- i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+ i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
nfs4_inode_return_delegation(inode);
@@ -4955,79 +5032,6 @@ out:
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
- struct nfs4_state *state, long *timeout)
-{
- struct nfs_client *clp = server->nfs_client;
-
- if (task->tk_status >= 0)
- return 0;
- switch(task->tk_status) {
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OPENMODE:
- if (state == NULL)
- break;
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_EXPIRED:
- if (state != NULL) {
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- }
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_STALE_CLIENTID:
- nfs4_schedule_lease_recovery(clp);
- goto wait_on_recovery;
- case -NFS4ERR_MOVED:
- if (nfs4_schedule_migration_recovery(server) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_LEASE_MOVED:
- nfs4_schedule_lease_moved_recovery(clp);
- goto wait_on_recovery;
-#if defined(CONFIG_NFS_V4_1)
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
- dprintk("%s ERROR %d, Reset session\n", __func__,
- task->tk_status);
- nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- goto wait_on_recovery;
-#endif /* CONFIG_NFS_V4_1 */
- case -NFS4ERR_DELAY:
- nfs_inc_server_stats(server, NFSIOS_DELAY);
- rpc_delay(task, nfs4_update_delay(timeout));
- goto restart_call;
- case -NFS4ERR_GRACE:
- rpc_delay(task, NFS4_POLL_RETRY_MAX);
- case -NFS4ERR_RETRY_UNCACHED_REP:
- case -NFS4ERR_OLD_STATEID:
- goto restart_call;
- }
- task->tk_status = nfs4_map_errors(task->tk_status);
- return 0;
-recovery_failed:
- task->tk_status = -EIO;
- return 0;
-wait_on_recovery:
- rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
- if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
- rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
- if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- goto recovery_failed;
-restart_call:
- task->tk_status = 0;
- return -EAGAIN;
-}
-
static void nfs4_init_boot_verifier(const struct nfs_client *clp,
nfs4_verifier *bootverf)
{
@@ -5049,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5077,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
return -ENOMEM;
rcu_read_lock();
- result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+ scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
@@ -5090,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
static int
nfs4_init_uniquifier_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5110,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
@@ -5121,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5146,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
clp->cl_owner_id = str;
@@ -5385,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
if (data == NULL)
return -ENOMEM;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
@@ -5427,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
int err;
do {
err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
- trace_nfs4_delegreturn(inode, err);
+ trace_nfs4_delegreturn(inode, stateid, err);
switch (err) {
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -5513,18 +5519,7 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_inode_wait(inode, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_inode_wait(inode, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_inode_wait(inode, fl);
}
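The open-coded dispatch deleted above moved into a common VFS helper; a
sketch of what locks_lock_inode_wait() is expected to do (mirroring the
removed switch, not a quote of fs/locks.c):

	int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
	{
		switch (fl->fl_flags & (FL_POSIX | FL_FLOCK)) {
		case FL_POSIX:
			return posix_lock_inode_wait(inode, fl);
		case FL_FLOCK:
			return flock_lock_inode_wait(inode, fl);
		}
		BUG();
		return -EINVAL;		/* not reached */
	}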
struct nfs4_unlockdata {
@@ -5533,7 +5528,7 @@ struct nfs4_unlockdata {
struct nfs4_lock_state *lsp;
struct nfs_open_context *ctx;
struct file_lock fl;
- const struct nfs_server *server;
+ struct nfs_server *server;
unsigned long timestamp;
};
@@ -5948,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->cancelled = 1;
rpc_put_task(task);
dprintk("%s: done, ret = %d!\n", __func__, ret);
+ trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
return ret;
}
@@ -5964,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
- trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -5991,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
- trace_nfs4_lock_expired(request, state, F_SETLK, err);
switch (err) {
default:
goto out;
@@ -6099,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
do {
err = _nfs4_proc_setlk(state, cmd, request);
- trace_nfs4_set_lock(request, state, cmd, err);
if (err == -NFS4ERR_DENIED)
err = -EAGAIN;
err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -6260,48 +6253,32 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
-static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
const void *buf, size_t buflen,
- int flags, int type)
+ int flags)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
}
-static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
}
-static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
{
- size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
-
- if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
- return 0;
-
- if (list && len <= list_len)
- memcpy(list, XATTR_NAME_NFSV4_ACL, len);
- return len;
+ return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
}
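The .list callback changes from "copy my name into the buffer" to a simple
visibility predicate; the VFS is now assumed to emit handler->name (or
->prefix) itself. A hedged sketch of the test generic_listxattr() would
apply (xattr_handler_visible is a hypothetical helper for illustration):

	static bool xattr_handler_visible(const struct xattr_handler *h,
					  struct dentry *dentry)
	{
		/* no .list callback means "always list" */
		return !h->list || h->list(dentry);
	}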
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
-static inline int nfs4_server_supports_labels(struct nfs_server *server)
-{
- return server->caps & NFS_CAP_SECURITY_LABEL;
-}
-static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags, int type)
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags)
{
if (security_ismaclabel(key))
return nfs4_set_security_label(dentry, buf, buflen);
@@ -6309,36 +6286,43 @@ static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
return -EOPNOTSUPP;
}
-static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
if (security_ismaclabel(key))
return nfs4_get_security_label(d_inode(dentry), buf, buflen);
return -EOPNOTSUPP;
}
-static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
{
- size_t len = 0;
+ int len = 0;
- if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
- len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
- if (list && len <= list_len)
- security_inode_listsecurity(d_inode(dentry), list, len);
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(inode, list, list_len);
+ if (list_len && len > list_len)
+ return -ERANGE;
}
return len;
}
static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = nfs4_xattr_list_nfs4_label,
.get = nfs4_xattr_get_nfs4_label,
.set = nfs4_xattr_set_nfs4_label,
};
-#endif
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+
+#endif
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
@@ -6868,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
},
.allow.u.words = {
[0] = 1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN) |
1 << (OP_COMMIT),
[1] = 1 << (OP_SECINFO - 32) |
1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_LAYOUTRETURN - 32) |
1 << (OP_TEST_STATEID - 32) |
1 << (OP_FREE_STATEID - 32) |
1 << (OP_WRITE - 32)
@@ -6936,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
}
if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+ test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
test_bit(OP_LOCKU, sp->allow.u.longs)) {
dfprintk(MOUNT, " cleanup mode enabled\n");
set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
}
+ if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &clp->cl_sp4_flags);
+ }
+
if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
dfprintk(MOUNT, " secinfo mode enabled\n");
@@ -7769,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
struct nfs4_session *session = nfs4_get_session(server);
+ int ret;
dprintk("--> %s\n", __func__);
	/* Note there is a race here, where a CB_LAYOUTRECALL can come in
@@ -7779,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
if (nfs41_setup_sequence(session, &lgp->args.seq_args,
&lgp->res.seq_res, task))
return;
- if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
NFS_I(lgp->args.inode)->layout,
&lgp->args.range,
- lgp->args.ctx->state)) {
- rpc_exit(task, NFS4_OK);
- }
+ lgp->args.ctx->state);
+ if (ret < 0)
+ rpc_exit(task, ret);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -7804,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
goto out;
+
+ /*
+	 * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+	 * on the file. Set tk_status to -ENODATA to tell the upper layer
+	 * to retry the I/O in-band through the MDS.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ task->tk_status = -ENODATA;
+ goto out;
/*
* NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
* length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
@@ -7872,7 +7877,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
spin_unlock(&inode->i_lock);
goto out_restart;
}
- if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
goto out_restart;
out:
dprintk("<-- %s\n", __func__);
@@ -8000,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
trace_nfs4_layoutget(lgp->args.ctx,
&lgp->args.range,
&lgp->res.range,
+ &lgp->res.stateid,
status);
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
@@ -8056,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+ pnfs_mark_layout_returned_if_empty(lo);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
pnfs_clear_layoutreturn_waitbit(lo);
- lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
@@ -8092,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
};
int status = 0;
+ nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+ NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
dprintk("--> %s\n", __func__);
if (!sync) {
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
@@ -8107,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutreturn(lrp->args.inode, status);
+ trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8255,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutcommit(data->args.inode, status);
+ trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8729,7 +8739,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
- | NFS_CAP_LAYOUTSTATS,
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -8754,6 +8765,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
+ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ ssize_t error, error2;
+
+ error = generic_listxattr(dentry, list, size);
+ if (error < 0)
+ return error;
+ if (list) {
+ list += error;
+ size -= error;
+ }
+
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ if (error2 < 0)
+ return error2;
+ return error + error2;
+}
+
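The helper keeps the usual two-pass listxattr contract: a NULL buffer acts
as a size probe, since both generic_listxattr() and the label lister return
the required length when given no room. An illustrative caller:

	ssize_t len = nfs4_listxattr(dentry, NULL, 0);	/* probe total size */
	if (len > 0) {
		char *names = kmalloc(len, GFP_KERNEL);
		if (names)
			len = nfs4_listxattr(dentry, names, len);
	}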
static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -8770,7 +8799,7 @@ static const struct inode_operations nfs4_dir_inode_operations = {
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
@@ -8780,7 +8809,7 @@ static const struct inode_operations nfs4_file_inode_operations = {
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
@@ -8839,7 +8868,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
- .prefix = XATTR_NAME_NFSV4_ACL,
+ .name = XATTR_NAME_NFSV4_ACL,
.list = nfs4_xattr_list_nfs4_acl,
.get = nfs4_xattr_get_nfs4_acl,
.set = nfs4_xattr_set_nfs4_acl,
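Switching the ACL handler from .prefix to .name makes the match exact, which
is why the strcmp(key, "") guards could be dropped from the get/set handlers
earlier in this patch. A sketch of the distinction, assuming the 4.5-era
xattr handler lookup (xattr_handler_matches is a hypothetical helper):

	static bool xattr_handler_matches(const struct xattr_handler *h,
					  const char *full_name)
	{
		/* illustrative only: exact-name vs. prefix matching */
		if (h->name)
			return strcmp(h->name, full_name) == 0;
		return strncmp(h->prefix, full_name, strlen(h->prefix)) == 0;
	}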
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0fbd3ab1be22..8693d77c45ea 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -12,7 +12,7 @@
#include "nfs4idmap.h"
#include "callback.h"
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d774335cc8bc..2850bce19244 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -6,6 +6,7 @@
#include "internal.h"
#include "nfs4session.h"
#include "callback.h"
+#include "pnfs.h"
#define CREATE_TRACE_POINTS
#include "nfs4trace.h"
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 671cf68fe56b..2c8d05dae5b1 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done,
__entry->highest_slotid = res->sr_highest_slotid;
__entry->target_highest_slotid =
res->sr_target_highest_slotid;
+ __entry->status_flags = res->sr_status_flags;
__entry->error = res->sr_status;
),
TP_printk(
@@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__field(u64, fileid)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, openstateid_seq)
+ __field(u32, openstateid_hash)
),
TP_fast_assign(
@@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->flags = flags;
__entry->fmode = (__force unsigned int)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
- if (!IS_ERR_OR_NULL(state))
+ if (!IS_ERR_OR_NULL(state)) {
inode = state->inode;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->openstateid_seq =
+ be32_to_cpu(state->open_stateid.seqid);
+ __entry->openstateid_hash =
+ nfs_stateid_hash(&state->open_stateid);
+ } else {
+ __entry->stateid_seq = 0;
+ __entry->stateid_hash = 0;
+ __entry->openstateid_seq = 0;
+ __entry->openstateid_hash = 0;
+ }
if (inode != NULL) {
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
TP_printk(
"error=%d (%s) flags=%d (%s) fmode=%s "
"fileid=%02x:%02x:%llu fhandle=0x%08x "
- "name=%02x:%02x:%llu/%s",
+ "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+ "openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->flags,
@@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fhandle,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->openstateid_seq, __entry->openstateid_hash
)
);
@@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+TRACE_EVENT(nfs4_cached_open,
+ TP_PROTO(
+ const struct nfs4_state *state
+ ),
+ TP_ARGS(state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x stateid=%d:0x%08x",
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
TRACE_EVENT(nfs4_close,
TP_PROTO(
const struct nfs4_state *state,
@@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close,
__field(u64, fileid)
__field(unsigned int, fmode)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close,
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->fmode = (__force unsigned int)state->state;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->stateid);
),
TP_printk(
"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
- "fhandle=0x%08x",
+ "fhandle=0x%08x openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->fmode ? show_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) cmd=%s:%s range=%lld:%lld "
- "fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
show_lock_cmd(__entry->cmd),
@@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
), \
TP_ARGS(request, state, cmd, error))
DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+TRACE_EVENT(nfs4_set_lock,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ const nfs4_stateid *lockstateid,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, lockstateid, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, lockstateid_seq)
+ __field(u32, lockstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->lockstateid_seq =
+ be32_to_cpu(lockstateid->seqid);
+ __entry->lockstateid_hash =
+ nfs_stateid_hash(lockstateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->lockstateid_seq, __entry->lockstateid_hash
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_PROTO(
const struct inode *inode,
@@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit,
__field(dev_t, dev)
__field(u32, fhandle)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
__entry->dev = res->server->s_dev;
__entry->fhandle = nfs_fhandle_hash(args->fhandle);
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(args->stateid);
),
TP_printk(
- "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+ "error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
- "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
), \
TP_ARGS(inode, error))
-DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_EVENT(nfs4_access);
DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
@@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
-DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
@@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
), \
TP_ARGS(clp, fhandle, inode, error))
DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
-DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
@@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
#define DEFINE_NFS4_READ_EVENT(name) \
@@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget,
const struct nfs_open_context *ctx,
const struct pnfs_layout_range *args,
const struct pnfs_layout_range *res,
+ const nfs4_stateid *layout_stateid,
int error
),
- TP_ARGS(ctx, args, res, error),
+ TP_ARGS(ctx, args, res, layout_stateid, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget,
__field(u64, offset)
__field(u64, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
const struct inode *inode = d_inode(ctx->dentry);
+ const struct nfs4_state *state = ctx->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget,
__entry->offset = args->offset;
__entry->count = args->length;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ if (!error) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(layout_stateid->seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(layout_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "iomode=%s offset=%llu count=%llu",
+ "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget,
__entry->fhandle,
show_pnfs_iomode(__entry->iomode),
(unsigned long long)__entry->offset,
- (unsigned long long)__entry->count
+ (unsigned long long)__entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+#define show_pnfs_update_layout_reason(reason) \
+ __print_symbolic(reason, \
+ { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
+ { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \
+ { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \
+ { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \
+ { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \
+ { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \
+ { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \
+ { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
+ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
+ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+
+TRACE_EVENT(pnfs_update_layout,
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ enum pnfs_update_layout_reason reason
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, reason),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(enum pnfs_update_layout_reason, reason)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ __entry->reason = reason;
+ if (lo != NULL) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x (%s)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ show_pnfs_update_layout_reason(__entry->reason)
+ )
+);
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* _TRACE_NFS4_H */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 788adf3897c7..4e4441216804 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1659,7 +1659,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
*p = cpu_to_be32(FATTR4_WORD0_ACL);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(arg->acl_len);
- xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+ xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
}
static void
@@ -2491,7 +2491,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, args->acl_pgbase, args->acl_len);
+ args->acl_pages, 0, args->acl_len);
encode_nops(&hdr);
}
@@ -3615,6 +3615,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = 0;
if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
goto out;
+ bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS;
status = -EIO;
	/* Ignore broken servers that return unrequested attrs */
if (unlikely(res == NULL))
@@ -4375,6 +4376,11 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
goto xdr_error;
if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
goto xdr_error;
if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
@@ -4574,6 +4580,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_mode(xdr, bitmap, &fmode);
if (status < 0)
goto xdr_error;
@@ -4627,6 +4637,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
if (status < 0)
goto xdr_error;
@@ -4764,6 +4778,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
return 0;
}
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
unsigned int savep;
@@ -4789,15 +4825,28 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
goto xdr_error;
fsinfo->wtpref = fsinfo->wtmax;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
if (status != 0)
goto xdr_error;
status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
if (status != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
if (status)
goto xdr_error;
+ status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ if (status)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
@@ -7465,6 +7514,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC(CLONE, enc_clone, dec_clone),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 9bc9f04fb7f6..89a15dbe5efc 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -90,7 +90,7 @@
#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
-static char nfs_root_parms[256] __initdata = "";
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
/* Text-based mount options passed to super.c */
static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 59f838cdc009..9f80a086b612 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
- { 1 << NFS_INO_COMMIT, "COMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5aaed363556a..9aebffb40505 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -124,7 +124,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
retry_lookup:
od = osduld_info_lookup(&odi);
- if (unlikely(IS_ERR(od))) {
+ if (IS_ERR(od)) {
err = PTR_ERR(od);
dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
if (err == -ENODEV && retry_flag) {
@@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
return page;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index fe3ddd20ff89..8ce4f61cbaa5 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p)
kmem_cache_free(nfs_page_cachep, p);
}
-static void
-nfs_iocounter_inc(struct nfs_io_counter *c)
-{
- atomic_inc(&c->io_count);
-}
-
-static void
-nfs_iocounter_dec(struct nfs_io_counter *c)
-{
- if (atomic_dec_and_test(&c->io_count)) {
- clear_bit(NFS_IO_INPROGRESS, &c->flags);
- smp_mb__after_atomic();
- wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
- }
-}
-
-static int
-__nfs_iocounter_wait(struct nfs_io_counter *c)
-{
- wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
- DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
- set_bit(NFS_IO_INPROGRESS, &c->flags);
- if (atomic_read(&c->io_count) == 0)
- break;
- ret = nfs_wait_bit_killable(&q.key);
- } while (atomic_read(&c->io_count) != 0 && !ret);
- finish_wait(wq, &q.wait);
- return ret;
-}
-
/**
* nfs_iocounter_wait - wait for i/o to complete
- * @c: nfs_io_counter to use
+ * @l_ctx: nfs_lock_context with io_counter to use
*
* returns -ERESTARTSYS if interrupted by a fatal signal.
* Otherwise returns 0 once the io_count hits 0.
*/
int
-nfs_iocounter_wait(struct nfs_io_counter *c)
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
{
- if (atomic_read(&c->io_count) == 0)
- return 0;
- return __nfs_iocounter_wait(c);
+ return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
+ TASK_KILLABLE);
}
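The open-coded waitqueue logic collapses onto the generic wait_on_atomic_t()
facility; the pairing, as used elsewhere in this patch:

	/* waiter: sleeps (killably) until io_count reaches zero */
	int err = wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
				   TASK_KILLABLE);

	/* waker (see nfs_clear_request below): the last I/O drops the count */
	if (atomic_dec_and_test(&l_ctx->io_count))
		wake_up_atomic_t(&l_ctx->io_count);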
/*
@@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
return ERR_CAST(l_ctx);
}
req->wb_lock_context = l_ctx;
- nfs_iocounter_inc(&l_ctx->io_count);
+ atomic_inc(&l_ctx->io_count);
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
@@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req)
req->wb_page = NULL;
}
if (l_ctx != NULL) {
- nfs_iocounter_dec(&l_ctx->io_count);
+ if (atomic_dec_and_test(&l_ctx->io_count))
+ wake_up_atomic_t(&l_ctx->io_count);
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
@@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
* @desc: IO descriptor
* @hdr: pageio header
*/
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_mirror *mirror;
- u32 midx;
-
set_bit(NFS_IOHDR_REDO, &hdr->flags);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
- /* TODO: Make sure it's right to clean up all mirrors here
- * and not just hdr->pgio_mirror_idx */
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- }
- return -ENOMEM;
}
/**
@@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
unsigned int pagecount, pageused;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
pages = hdr->page_array.pagevec;
@@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*pages++ = last_page = req->wb_page;
}
}
- if (WARN_ON_ONCE(pageused != pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror;
struct nfs_pgio_header *hdr;
int ret;
- mirror = nfs_pgio_current_mirror(desc);
-
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- /* TODO: make sure this is right with mirroring - or
- * should it back out all mirrors? */
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (pgio->pg_error < 0)
+ return pgio->pg_error;
+
if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
return -EINVAL;
@@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
pgio->pg_mirrors_dynamic = NULL;
}
-static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
- const struct nfs_open_context *ctx2)
-{
- return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
-}
-
static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
const struct nfs_lock_context *l2)
{
@@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
mirror->pg_base = req->wb_pgbase;
}
if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
bytes = req->wb_bytes;
nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
if (midx) {
@@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (IS_ERR(dupreq)) {
nfs_page_group_unlock(req);
- return 0;
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
}
nfs_lock_request(dupreq);
@@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- return 0;
+ goto out_failed;
}
return 1;
+
+out_failed:
+ /*
+	 * We might have failed before sending any reqs over the wire.
+	 * Clean up the rest of the reqs in each mirror's pg_list.
+ */
+ if (desc->pg_error) {
+ struct nfs_pgio_mirror *mirror;
+ void (*func)(struct list_head *);
+
+ /* remember fatal errors */
+ if (nfs_error_is_fatal(desc->pg_error))
+ mapping_set_error(desc->pg_inode->i_mapping,
+ desc->pg_error);
+
+ func = desc->pg_completion_ops->error_cleanup;
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ func(&mirror->pg_list);
+ }
+ }
+ return 0;
}
/*
@@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
list_move(&failed, &hdr->pages);
- return -EIO;
+ return desc->pg_error < 0 ? desc->pg_error : -EIO;
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8abe27165ad0..a3592cc34a20 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
static LIST_HEAD(pnfs_modules_tbl);
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync);
/* Return the registered pnfs layout driver module matching given id */
@@ -385,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
enum pnfs_iomode iomode;
bool send;
- stateid = lo->plh_stateid;
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
iomode = lo->plh_return_iomode;
send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock);
if (send) {
		/* Send an async layoutreturn so we don't deadlock */
- pnfs_send_layoutreturn(lo, stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
}
} else
spin_unlock(&inode->i_lock);
@@ -566,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range)
+ const struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
- int invalid = 0, removed = 0;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
lseg->pls_range.length);
- invalid++;
- removed += mark_lseg_invalid(lseg, tmp_list);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
}
- dprintk("%s:Return %i\n", __func__, invalid - removed);
- return invalid - removed;
+ dprintk("%s:Return %i\n", __func__, remaining);
+ return remaining;
}
/* note free_me must contain lsegs from a single layout_hdr */
@@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_get_layout_hdr(lo);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
- pnfs_clear_retry_layoutget(lo);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -703,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
ret = -EAGAIN;
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&lseg_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
iput(inode);
}
@@ -826,7 +827,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state)
{
int status = 0;
@@ -861,7 +862,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
@@ -872,44 +873,41 @@ send_layoutget(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
- lgp = kzalloc(sizeof(*lgp), gfp_flags);
- if (lgp == NULL)
- return NULL;
-
- i_size = i_size_read(ino);
-
- lgp->args.minlength = PAGE_CACHE_SIZE;
- if (lgp->args.minlength > range->length)
- lgp->args.minlength = range->length;
- if (range->iomode == IOMODE_READ) {
- if (range->offset >= i_size)
- lgp->args.minlength = 0;
- else if (i_size - range->offset < lgp->args.minlength)
- lgp->args.minlength = i_size - range->offset;
- }
- lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range = *range;
- lgp->args.type = server->pnfs_curr_ld->id;
- lgp->args.inode = ino;
- lgp->args.ctx = get_nfs_open_context(ctx);
- lgp->gfp_flags = gfp_flags;
- lgp->cred = lo->plh_lc_cred;
-
- /* Synchronously retrieve layout information from server and
- * store in lseg.
+ /*
+	 * Synchronously retrieve layout information from the server and
+	 * store it in lseg. If we race with a concurrent seqid-morphing
+ * op, then re-send the LAYOUTGET.
*/
- lseg = nfs4_proc_layoutget(lgp, gfp_flags);
- if (IS_ERR(lseg)) {
- switch (PTR_ERR(lseg)) {
- case -ENOMEM:
- case -ERESTARTSYS:
- break;
- default:
- /* remember that LAYOUTGET failed and suspend trying */
- pnfs_layout_io_set_failed(lo, range->iomode);
+ do {
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return NULL;
+
+ i_size = i_size_read(ino);
+
+ lgp->args.minlength = PAGE_CACHE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
}
- return NULL;
- } else
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
+
+ lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+ } while (lseg == ERR_PTR(-EAGAIN));
+
+ if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+ lseg = NULL;
+ else
pnfs_layout_clear_fail_bit(lo,
pnfs_iomode_to_fail_bit(range->iomode));
@@ -940,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
}
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
{
struct inode *ino = lo->plh_inode;
@@ -957,7 +955,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
goto out;
}
- lrp->args.stateid = stateid;
+ nfs4_stateid_copy(&lrp->args.stateid, stateid);
lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
lrp->args.inode = ino;
lrp->args.range.iomode = iomode;
@@ -1000,7 +998,7 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- stateid = nfsi->layout->plh_stateid;
+ nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
@@ -1028,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
- status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_put_layout_hdr(lo);
out:
@@ -1091,13 +1089,12 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- stateid = lo->plh_stateid;
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lo->plh_flags))
layoutreturn = pnfs_prepare_layoutreturn(lo);
- pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
@@ -1119,7 +1116,7 @@ out_noroc:
pnfs_free_lseg_list(&tmp_list);
pnfs_layoutcommit_inode(ino, true);
if (layoutreturn)
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
return roc;
}
@@ -1144,6 +1141,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
+ pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
@@ -1460,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
-{
- if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
- return 1;
- return nfs_wait_bit_killable(key);
-}
-
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
- if (!pnfs_should_retry_layoutget(lo))
- return false;
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
*/
pnfs_layoutcommit_inode(lo->plh_inode, false);
return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
- pnfs_layoutget_retry_bit_wait,
+ nfs_wait_bit_killable,
TASK_UNINTERRUPTIBLE);
}
@@ -1515,14 +1503,23 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
bool first;
- if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NO_PNFS);
goto out;
+ }
- if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+ if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
goto out;
+ }
- if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_MDSTHRESH);
goto out;
+ }
lookup_again:
first = false;
@@ -1530,19 +1527,25 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
}
/* Do we even need to bother with this? */
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BULK_RECALL);
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
/* if LAYOUTGET already failed once we don't try again */
- if (pnfs_layout_io_test_failed(lo, iomode) &&
- !pnfs_should_retry_layoutget(lo))
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
+ }
first = list_empty(&lo->plh_segs);
if (first) {
@@ -1562,8 +1565,11 @@ lookup_again:
* already exists
*/
lseg = pnfs_find_lseg(lo, &arg);
- if (lseg)
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
goto out_unlock;
+ }
}
/*
@@ -1580,11 +1586,16 @@ lookup_again:
dprintk("%s retrying\n", __func__);
goto lookup_again;
}
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_RETURN);
goto out_put_layout_hdr;
}
- if (pnfs_layoutgets_blocked(lo))
+ if (pnfs_layoutgets_blocked(lo)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BLOCKED);
goto out_unlock;
+ }
atomic_inc(&lo->plh_outstanding);
spin_unlock(&ino->i_lock);
@@ -1607,8 +1618,9 @@ lookup_again:
arg.length = PAGE_CACHE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- pnfs_clear_retry_layoutget(lo);
atomic_dec(&lo->plh_outstanding);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1618,7 +1630,7 @@ out:
"(%s, offset: %llu, length: %llu)\n",
__func__, ino->i_sb->s_id,
(unsigned long long)NFS_FILEID(ino),
- lseg == NULL ? "not found" : "found",
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
iomode==IOMODE_RW ? "read/write" : "read-only",
(unsigned long long)pos,
(unsigned long long)count);
@@ -1687,6 +1699,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
+ status = -EAGAIN;
goto out_forget_reply;
}
pnfs_set_layout_stateid(lo, &res->stateid, false);
@@ -1724,16 +1737,29 @@ out_forget_reply:
}
static void
+pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+{
+ if (lo->plh_return_iomode == iomode)
+ return;
+ if (lo->plh_return_iomode != 0)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+}
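The new helper merges the pending return iomode; a worked example of the
three cases it handles:

	lo->plh_return_iomode = 0;			/* nothing pending */
	pnfs_set_plh_return_iomode(lo, IOMODE_READ);	/* -> IOMODE_READ */
	pnfs_set_plh_return_iomode(lo, IOMODE_READ);	/* same mode: unchanged */
	pnfs_set_plh_return_iomode(lo, IOMODE_RW);	/* differs -> IOMODE_ANY */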
+
+int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *return_range)
+ const struct pnfs_layout_range *return_range)
{
struct pnfs_layout_segment *lseg, *next;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
if (list_empty(&lo->plh_segs))
- return;
+ return 0;
+
+ assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(&lseg->pls_range, return_range)) {
@@ -1743,38 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg->pls_range.offset,
lseg->pls_range.length);
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- mark_lseg_invalid(lseg, tmp_list);
+ pnfs_set_plh_return_iomode(lo, return_range->iomode);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lo->plh_flags);
}
+ return remaining;
}
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
struct pnfs_layout_range range = {
.iomode = lseg->pls_range.iomode,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
LIST_HEAD(free_me);
+ bool return_now = false;
spin_lock(&inode->i_lock);
- /* set failure bit so that pnfs path will be retried later */
- pnfs_layout_set_fail_bit(lo, iomode);
- if (lo->plh_return_iomode == 0)
- lo->plh_return_iomode = range.iomode;
- else if (lo->plh_return_iomode != range.iomode)
- lo->plh_return_iomode = IOMODE_ANY;
+ pnfs_set_plh_return_iomode(lo, range.iomode);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
- spin_unlock(&inode->i_lock);
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode = lo->plh_return_iomode;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ return_now = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (return_now)
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ } else {
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ }
pnfs_free_lseg_list(&free_me);
}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -1796,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
rd_size,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
}
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
@@ -1808,13 +1848,19 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- if (pgio->pg_lseg == NULL)
+ if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
wb_size,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_write_mds(pgio);
@@ -1912,12 +1958,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
- if (!hdr->pnfs_error) {
+ if (likely(!hdr->pnfs_error)) {
pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
hdr->mds_offset + hdr->res.count);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_write_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -1981,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
@@ -2028,11 +2073,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
__nfs4_read_done_cb(hdr);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_read_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -2111,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
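The pg_init and pg_*pages hunks above converge on one convention: pnfs_update_layout() may now hand back an ERR_PTR(), and every consumer converts that into desc->pg_error instead of stashing a poisoned pointer. A minimal sketch of the rule, with pg_take_lseg() a hypothetical helper name; illustrative only, not part of the patch:

static void pg_take_lseg(struct nfs_pageio_descriptor *pgio,
			 struct pnfs_layout_segment *lseg)
{
	if (IS_ERR(lseg)) {
		pgio->pg_error = PTR_ERR(lseg);	/* surface the failure */
		pgio->pg_lseg = NULL;		/* never cache an ERR_PTR */
		return;
	}
	pgio->pg_lseg = lseg;	/* may still be NULL: fall back to MDS */
}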
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d1990e90e7a0..9f4e2a47f4aa 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -98,7 +98,6 @@ enum {
NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
- NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
};
enum layoutdriver_policy_flags {
@@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
return d;
}
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
- atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
- atomic_dec(&lo->plh_refcount);
- /* wake up waiters for LAYOUTRETURN as that is not needed */
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- }
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
return lseg;
}
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+ return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0;
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
@@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset;
}
+/**
+ * pnfs_mark_layout_returned_if_empty - mark the layout returned if it has no segments
+ * @lo: layout header
+ *
+ * Note: Caller must hold inode->i_lock
+ */
+static inline void
+pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
+{
+ if (list_empty(&lo->plh_segs))
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+ const struct pnfs_layout_range *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
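A condensed sketch of how the two new pnfs.h helpers above are meant to be used together, under the inode lock the kernel-doc requires; every name here appears elsewhere in this diff, only the combination is illustrative:

struct pnfs_layout_range copy;

spin_lock(&ino->i_lock);
pnfs_mark_layout_returned_if_empty(lo);		/* needs i_lock, per kernel-doc */
pnfs_copy_range(&copy, &lseg->pls_range);	/* plain memcpy wrapper */
spin_unlock(&ino->i_lock);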
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 24655b807d44..81ac6480f9e7 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
} else {
nfs_retry_commit(mds_pages, NULL, cinfo, 0);
pnfs_generic_retry_commit(cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
}
nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
- if (nreq == 0) {
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ if (nreq == 0)
goto out;
- }
atomic_add(nreq, &cinfo->mds->rpcs_out);
@@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
+ if (!pnfs_is_valid_lseg(lseg)) {
+ spin_unlock(cinfo->lock);
+ cinfo->completion_ops->resched_write(cinfo, req);
+ return;
+ }
/* Non-empty buckets hold a reference on the lseg. That ref
* is normally transferred to the COMMIT call and released
* there. It could also be released if the last req is pulled
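The bucket hunk above pairs with the nfs_commit_resched_write() hook added in the write.c part of this diff: if the lseg went invalid (layout returned or recalled) before the request could take a bucket reference, the request is not committed through the DS at all -- its page is simply re-dirtied so a later writeback pass resends it. Sketched together:

if (!pnfs_is_valid_lseg(lseg)) {	/* NFS_LSEG_VALID was cleared */
	spin_unlock(cinfo->lock);
	cinfo->completion_ops->resched_write(cinfo, req);
	return;
}

/* the hook itself, from fs/nfs/write.c below: */
static void nfs_commit_resched_write(struct nfs_commit_info *cinfo,
				     struct nfs_page *req)
{
	__set_page_dirty_nobuffers(req->wb_page);
}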
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 01b8cc8e8cfc..eb31e23e7def 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+static void nfs_readpage_release(struct nfs_page *req)
+{
+ struct inode *inode = d_inode(req->wb_context->dentry);
+
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+ (long long)req_offset(req));
+
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(inode, req->wb_page, 0);
+
+ unlock_page(req->wb_page);
+ }
+ nfs_release_request(req);
+}
+
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
@@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
- nfs_pageio_add_request(&pgio, new);
+ if (!nfs_pageio_add_request(&pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
+ }
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
@@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
- struct inode *inode = d_inode(req->wb_context->dentry);
-
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
- (long long)req_offset(req));
-
- if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
- unlock_page(req->wb_page);
- }
- nfs_release_request(req);
+ return pgio.pg_error < 0 ? pgio.pg_error : 0;
}
static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -246,6 +249,13 @@ static void nfs_readpage_retry(struct rpc_task *task,
nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
+
+ /* For non-RPC-based layout drivers, retry through the MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Yes, so retry the read at the end of the hdr */
hdr->mds_offset += resp->count;
argp->offset += resp->count;
@@ -268,7 +278,7 @@ static void nfs_readpage_result(struct rpc_task *task,
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (hdr->res.count != hdr->args.count)
+ } else if (hdr->res.count < hdr->args.count)
nfs_readpage_retry(task, hdr);
}
@@ -354,6 +364,8 @@ readpage_async_filler(void *data, struct page *page)
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
error = desc->pgio->pg_error;
goto out_unlock;
}
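Both read paths above now unwind a rejected request themselves, because nfs_pageio_add_request() can fail with a fatal layout error carried in pg_error rather than only a transient one. The pattern, condensed from the two hunks:

if (!nfs_pageio_add_request(&pgio, new)) {
	nfs_list_remove_request(new);	/* take it off the local list */
	nfs_readpage_release(new);	/* unlock the page, drop the req */
	error = pgio.pg_error;		/* propagate the descriptor error */
}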
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 383a027de452..f1268280244e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2816,7 +2816,6 @@ out_invalid_transport_udp:
* NFS client for backwards compatibility
*/
unsigned int nfs_callback_set_tcpport;
-unsigned short nfs_callback_tcpport;
/* Default cache timeout is 10 minutes */
unsigned int nfs_idmap_cache_timeout = 600;
/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
@@ -2827,7 +2826,6 @@ char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
-EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
EXPORT_SYMBOL_GPL(max_session_slots);
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index b6de433da5db..4fe3eead3868 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -42,21 +42,35 @@ error:
return -EIO;
}
-static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *nfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct page *page;
void *err;
- err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
- if (err)
- return err;
- page = read_cache_page(&inode->i_data, 0,
- (filler_t *)nfs_symlink_filler, inode);
- if (IS_ERR(page))
- return ERR_CAST(page);
- *cookie = page;
- return kmap(page);
+ if (!dentry) {
+ err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
+ if (err)
+ return err;
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+ if (err)
+ return err;
+ page = read_cache_page(&inode->i_data, 0,
+ (filler_t *)nfs_symlink_filler, inode);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ }
+ set_delayed_call(done, page_put_link, page);
+ return page_address(page);
}
/*
@@ -64,8 +78,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
*/
const struct inode_operations nfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = nfs_follow_link,
- .put_link = page_put_link,
+ .get_link = nfs_get_link,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
};
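The symlink hunk above is part of the tree-wide move from ->follow_link()/->put_link() to ->get_link(): a NULL dentry signals an RCU walk where sleeping is forbidden, and cleanup is scheduled through a delayed_call instead of an opaque cookie. A minimal page-backed sketch of the contract, assuming mainline VFS helpers of this era (read_mapping_page(), set_delayed_call()); illustrative, not NFS's actual implementation:

static const char *sketch_get_link(struct dentry *dentry, struct inode *inode,
				   struct delayed_call *done)
{
	struct page *page;

	if (!dentry)				/* RCU walk: must not block */
		return ERR_PTR(-ECHILD);	/* ask VFS to retry in ref-walk */
	page = read_mapping_page(inode->i_mapping, 0, NULL);
	if (IS_ERR(page))
		return ERR_CAST(page);
	set_delayed_call(done, page_put_link, page);	/* page put later */
	return page_address(page);
}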
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75ab7622e0cc..ce43cd6d88c6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -21,6 +21,8 @@
#include <linux/nfs_page.h>
#include <linux/backing-dev.h>
#include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
#include <asm/uaccess.h>
@@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc)
{
int ret = 0;
if (wbc->for_reclaim)
- return FLUSH_HIGHPRI | FLUSH_STABLE;
+ return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
if (wbc->sync_mode == WB_SYNC_ALL)
ret = FLUSH_COND_STABLE;
- if (wbc->for_kupdate || wbc->for_background)
- ret |= FLUSH_LOWPRI;
return ret;
}
@@ -545,12 +545,22 @@ try_again:
return head;
}
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+ nfs_unlock_request(req);
+ nfs_end_page_writeback(req);
+ nfs_release_request(req);
+ generic_error_remove_page(page_file_mapping(req->wb_page),
+ req->wb_page);
+}
+
/*
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock)
+ struct page *page, bool nonblock,
+ bool launder)
{
struct nfs_page *req;
int ret = 0;
@@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
- nfs_redirty_request(req);
ret = pgio->pg_error;
+ /*
+ * Remove the problematic req upon fatal errors
+ * in the launder case, while other dirty pages can
+ * still be around until they get flushed.
+ */
+ if (nfs_error_is_fatal(ret)) {
+ nfs_context_set_write_error(req->wb_context, ret);
+ if (launder) {
+ nfs_write_error_remove_page(req);
+ goto out;
+ }
+ }
+ nfs_redirty_request(req);
+ ret = -EAGAIN;
} else
nfs_add_stats(page_file_mapping(page)->host,
NFSIOS_WRITEPAGES, 1);
@@ -576,12 +599,14 @@ out:
return ret;
}
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio, bool launder)
{
int ret;
nfs_pageio_cond_complete(pgio, page_file_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+ launder);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
/*
* Write an mmapped page to the server.
*/
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+ struct writeback_control *wbc,
+ bool launder)
{
struct nfs_pageio_descriptor pgio;
struct inode *inode = page_file_mapping(page)->host;
@@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio);
+ err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc);
+ ret = nfs_writepage_locked(page, wbc, false);
unlock_page(page);
return ret;
}
@@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data);
+ ret = nfs_do_writepage(page, wbc, data, false);
unlock_page(page);
return ret;
}
@@ -1128,7 +1155,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
if (req == NULL)
return 0;
l_ctx = req->wb_lock_context;
- do_flush = req->wb_page != page || req->wb_context != ctx;
+ do_flush = req->wb_page != page ||
+ !nfs_match_open_context(req->wb_context, ctx);
/* for now, flush if more than 1 request in page_group */
do_flush |= req->wb_this_page != req;
if (l_ctx && flctx &&
@@ -1326,9 +1354,15 @@ static void nfs_async_write_error(struct list_head *head)
}
}
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ nfs_async_write_error(&hdr->pages);
+}
+
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
.error_cleanup = nfs_async_write_error,
.completion = nfs_write_completion,
+ .reschedule_io = nfs_async_write_reschedule_io,
};
void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -1505,6 +1539,13 @@ static void nfs_writeback_result(struct rpc_task *task,
task->tk_status = -EIO;
return;
}
+
+ /* For non-RPC-based layout drivers, retry through the MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
@@ -1522,27 +1563,21 @@ static void nfs_writeback_result(struct rpc_task *task,
}
}
-
-static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
{
- int ret;
+ return wait_on_atomic_t(&cinfo->rpcs_out,
+ nfs_wait_atomic_killable, TASK_KILLABLE);
+}
- if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
- return 1;
- if (!may_wait)
- return 0;
- ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- return (ret < 0) ? ret : 1;
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+ atomic_inc(&cinfo->rpcs_out);
}
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
- clear_bit(NFS_INO_COMMIT, &nfsi->flags);
- smp_mb__after_atomic();
- wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+ if (atomic_dec_and_test(&cinfo->rpcs_out))
+ wake_up_atomic_t(&cinfo->rpcs_out);
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1659,6 +1694,13 @@ void nfs_retry_commit(struct list_head *page_list,
}
EXPORT_SYMBOL_GPL(nfs_retry_commit);
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
+{
+ __set_page_dirty_nobuffers(req->wb_page);
+}
+
/*
* Commit dirty pages
*/
@@ -1680,7 +1722,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
data->mds_ops, how, 0);
out_bad:
nfs_retry_commit(head, NULL, cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
@@ -1742,8 +1783,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
- if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
- nfs_commit_clear_lock(NFS_I(data->inode));
+ nfs_commit_end(cinfo.mds);
}
static void nfs_commit_release(void *calldata)
@@ -1762,7 +1802,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
.completion = nfs_commit_release_pages,
- .error_cleanup = nfs_commit_clear_lock,
+ .resched_write = nfs_commit_resched_write,
};
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
@@ -1781,30 +1821,25 @@ int nfs_commit_inode(struct inode *inode, int how)
LIST_HEAD(head);
struct nfs_commit_info cinfo;
int may_wait = how & FLUSH_SYNC;
+ int error = 0;
int res;
- res = nfs_commit_set_lock(NFS_I(inode), may_wait);
- if (res <= 0)
- goto out_mark_dirty;
nfs_init_cinfo_from_inode(&cinfo, inode);
+ nfs_commit_begin(cinfo.mds);
res = nfs_scan_commit(inode, &head, &cinfo);
- if (res) {
- int error;
-
+ if (res)
error = nfs_generic_commit_list(inode, &head, how, &cinfo);
- if (error < 0)
- return error;
- if (!may_wait)
- goto out_mark_dirty;
- error = wait_on_bit_action(&NFS_I(inode)->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- if (error < 0)
- return error;
- } else
- nfs_commit_clear_lock(NFS_I(inode));
+ nfs_commit_end(cinfo.mds);
+ if (error < 0)
+ goto out_error;
+ if (!may_wait)
+ goto out_mark_dirty;
+ error = wait_on_commit(cinfo.mds);
+ if (error < 0)
+ return error;
return res;
+out_error:
+ res = error;
/* Note: If we exit without ensuring that the commit is complete,
* we must mark the inode as dirty. Otherwise, future calls to
* sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
@@ -1814,6 +1849,7 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return res;
}
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
@@ -1904,7 +1940,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
{
loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
@@ -1921,7 +1957,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc);
+ ret = nfs_writepage_locked(page, &wbc, launder);
if (ret < 0)
goto out_error;
continue;
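The commit-lock rework above replaces the NFS_INO_COMMIT bit with plain reference counting on cinfo->mds->rpcs_out: each commit pass brackets its work with nfs_commit_begin()/nfs_commit_end(), and synchronous callers park on wait_on_atomic_t() until the counter drains. The lifecycle, condensed from the hunks:

nfs_commit_begin(cinfo.mds);		/* atomic_inc(&rpcs_out) */
res = nfs_scan_commit(inode, &head, &cinfo);
if (res)
	error = nfs_generic_commit_list(inode, &head, how, &cinfo);
nfs_commit_end(cinfo.mds);		/* dec; wakes waiters at zero */

if (may_wait)				/* FLUSH_SYNC callers block here */
	error = wait_on_atomic_t(&cinfo.mds->rpcs_out,
				 nfs_wait_atomic_killable, TASK_KILLABLE);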
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 77e7a5cca888..1a03bc3059e8 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -58,7 +58,7 @@ nlm_fclose(struct file *filp)
fput(filp);
}
-static struct nlmsvc_binding nfsd_nlm_ops = {
+static const struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
};
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d8b16c2568f3..5fbf3bbd00d0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -92,7 +92,7 @@ struct nfsd_net {
struct file *rec_file;
bool in_grace;
- struct nfsd4_client_tracking_ops *client_tracking_ops;
+ const struct nfsd4_client_tracking_ops *client_tracking_ops;
time_t nfsd4_lease;
time_t nfsd4_grace;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp)
err = fh_getattr(fhp, &fhp->fh_post_attr);
fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
if (err) {
- fhp->fh_post_saved = 0;
+ fhp->fh_post_saved = false;
/* Grab the ctime anyway - set_change_info might use it */
fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
} else
- fhp->fh_post_saved = 1;
+ fhp->fh_post_saved = true;
}
/*
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
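compose_entry_fh() above (and nfsd4_encode_dirent_fattr() later in this diff) switch to lookup_one_len_unlocked(), which performs the same single-component lookup as lookup_one_len() but handles the parent's i_mutex itself, so the readdir callbacks no longer need to hold it across the call. The resulting convention, sketched:

struct dentry *child;

child = lookup_one_len_unlocked(name, parent, namelen);	/* no i_mutex held */
if (IS_ERR(child))
	return nfserrno(PTR_ERR(child));
if (d_really_is_negative(child)) {	/* entry may have raced away */
	dput(child);
	return nfserr_noent;
}
/* ... use child, then dput(child) ... */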
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e7f50c4081d6..7389cb1d7409 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_DOWN;
warn_no_callback_path(clp, reason);
}
static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_FAULT;
warn_no_callback_path(clp, reason);
}
@@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work)
}
void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
cb->cb_clp = clp;
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e487c75..ce2d010d3b17 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -22,7 +22,7 @@ struct nfs4_layout {
static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
@@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
INIT_LIST_HEAD(&ls->ls_perfile);
spin_lock_init(&ls->ls_lock);
INIT_LIST_HEAD(&ls->ls_layouts);
+ mutex_init(&ls->ls_mutex);
ls->ls_layout_type = layout_type;
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
status = nfserr_jukebox;
if (!ls)
goto out;
+ mutex_lock(&ls->ls_mutex);
} else {
ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
status = nfserr_bad_stateid;
+ mutex_lock(&ls->ls_mutex);
if (stateid->si_generation > stid->sc_stateid.si_generation)
- goto out_put_stid;
+ goto out_unlock_stid;
if (layout_type != ls->ls_layout_type)
- goto out_put_stid;
+ goto out_unlock_stid;
}
*lsp = ls;
return 0;
+out_unlock_stid:
+ mutex_unlock(&ls->ls_mutex);
out_put_stid:
nfs4_put_stid(stid);
out:
@@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
trace_layout_recall(&ls->ls_stid.sc_stateid);
atomic_inc(&ls->ls_stid.sc_count);
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
nfsd4_run_cb(&ls->ls_recall);
out_unlock:
@@ -406,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
list_add_tail(&new->lo_perstate, &ls->ls_layouts);
new = NULL;
done:
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
spin_unlock(&ls->ls_lock);
out:
spin_unlock(&fp->fi_lock);
@@ -481,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
}
if (!list_empty(&ls->ls_layouts)) {
- if (found) {
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
- sizeof(stateid_t));
- }
+ if (found)
+ nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
lrp->lrs_present = 1;
} else {
trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
@@ -494,6 +493,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
spin_unlock(&ls->ls_lock);
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
nfsd4_free_layouts(&reaplist);
return nfs_ok;
@@ -608,29 +608,55 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
}
}
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+ mutex_lock(&ls->ls_mutex);
+ nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+ mutex_unlock(&ls->ls_mutex);
+}
+
static int
nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfsd_net *nn;
+ ktime_t now, cutoff;
LIST_HEAD(reaplist);
+
switch (task->tk_status) {
case 0:
- return 1;
+ case -NFS4ERR_DELAY:
+ /*
+ * Anything left? If not, then call it done. Note that we don't
+ * take the spinlock since this is an optimization and nothing
+ * should get added until the cb counter goes to zero.
+ */
+ if (list_empty(&ls->ls_layouts))
+ return 1;
+
+ /* Poll the client until it's done with the layout */
+ now = ktime_get();
+ nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id);
+
+ /* Client gets 2 lease periods to return it */
+ cutoff = ktime_add_ns(task->tk_start,
+ nn->nfsd4_lease * NSEC_PER_SEC * 2);
+
+ if (ktime_before(now, cutoff)) {
+ rpc_delay(task, HZ/100); /* 10 milliseconds */
+ return 0;
+ }
+ /* Fallthrough */
case -NFS4ERR_NOMATCHING_LAYOUT:
trace_layout_recall_done(&ls->ls_stid.sc_stateid);
task->tk_status = 0;
return 1;
- case -NFS4ERR_DELAY:
- /* Poll the client until it's done with the layout */
- /* FIXME: cap number of retries.
- * The pnfs standard states that we need to only expire
- * the client after at-least "lease time" .eg lease-time * 2
- * when failing to communicate a recall
- */
- rpc_delay(task, HZ/100); /* 10 mili-seconds */
- return 0;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
@@ -654,7 +680,8 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
nfs4_put_stid(&ls->ls_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .prepare = nfsd4_cb_layout_prepare,
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
};
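The retry branch above bounds the previously unbounded -NFS4ERR_DELAY polling loop: the client gets two lease periods, measured from when the callback RPC started, before the recall is treated as done. The arithmetic in isolation (nfsd4_lease is in seconds, tk_start is a ktime_t); a sketch:

ktime_t now = ktime_get();
ktime_t cutoff = ktime_add_ns(task->tk_start,
			      (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2);

if (ktime_before(now, cutoff)) {
	rpc_delay(task, HZ / 100);	/* re-poll in roughly 10 ms */
	return 0;			/* keep the callback alive */
}
/* past the cutoff: fall through and finish the recall */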
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4ce6b97b31ad..819ad812c71b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -774,8 +774,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
- status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid,
- RD_STATE, &read->rd_filp, &read->rd_tmp_file);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &read->rd_stateid, RD_STATE,
+ &read->rd_filp, &read->rd_tmp_file);
if (status) {
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
@@ -921,7 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
status = nfs4_preprocess_stateid_op(rqstp, cstate,
- &setattr->sa_stateid, WR_STATE, NULL, NULL);
+ &cstate->current_fh, &setattr->sa_stateid,
+ WR_STATE, NULL, NULL);
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -985,8 +987,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
- status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE,
- &filp, NULL);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ stateid, WR_STATE, &filp, NULL);
if (status) {
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
@@ -1010,13 +1012,54 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_clone *clone)
+{
+ struct file *src, *dst;
+ __be32 status;
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
+ &clone->cl_src_stateid, RD_STATE,
+ &src, NULL);
+ if (status) {
+ dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+ goto out;
+ }
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &clone->cl_dst_stateid, WR_STATE,
+ &dst, NULL);
+ if (status) {
+ dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+ goto out_put_src;
+ }
+
+ /* fix up for NFS-specific error code */
+ if (!S_ISREG(file_inode(src)->i_mode) ||
+ !S_ISREG(file_inode(dst)->i_mode)) {
+ status = nfserr_wrong_type;
+ goto out_put_dst;
+ }
+
+ status = nfsd4_clone_file_range(src, clone->cl_src_pos,
+ dst, clone->cl_dst_pos, clone->cl_count);
+
+out_put_dst:
+ fput(dst);
+out_put_src:
+ fput(src);
+out:
+ return status;
+}
+
+static __be32
nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
__be32 status = nfserr_notsupp;
struct file *file;
- status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
WR_STATE, &file, NULL);
if (status != nfs_ok) {
@@ -1055,7 +1098,7 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct file *file;
- status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
RD_STATE, &file, NULL);
if (status) {
@@ -1309,6 +1352,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
nfserr = nfsd4_insert_layout(lgp, ls);
out_put_stid:
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
out:
return nfserr;
@@ -1362,6 +1406,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
goto out;
}
+ /* LAYOUTCOMMIT does not require any serialization */
+ mutex_unlock(&ls->ls_mutex);
+
if (new_size > i_size_read(inode)) {
lcp->lc_size_chg = 1;
lcp->lc_newsize = new_size;
@@ -2275,6 +2322,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+ [OP_CLONE] = {
+ .op_func = (nfsd4op_func)nfsd4_clone,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_CLONE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e3d47091b191..79f0307a5ec8 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -631,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
return -ENOENT;
}
-static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
.init = nfsd4_legacy_tracking_init,
.exit = nfsd4_legacy_tracking_exit,
.create = nfsd4_create_clid_dir,
@@ -1050,7 +1050,7 @@ out_err:
printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
}
-static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.init = nfsd4_init_cld_pipe,
.exit = nfsd4_remove_cld_pipe,
.create = nfsd4_cld_create,
@@ -1394,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
kfree(legacy);
}
-static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
.init = nfsd4_umh_cltrack_init,
.exit = NULL,
.create = nfsd4_umh_cltrack_create,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f1d5691b795..c484a2b6cd10 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab;
static void free_session(struct nfsd4_session *);
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static bool is_session_dead(struct nfsd4_session *ses)
{
@@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
atomic_set(&stid->sc_count, 1);
+ spin_lock_init(&stid->sc_lock);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
@@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s)
put_nfs4_file(fp);
}
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+ stateid_t *src = &stid->sc_stateid;
+
+ spin_lock(&stid->sc_lock);
+ if (unlikely(++src->si_generation == 0))
+ src->si_generation = 1;
+ memcpy(dst, src, sizeof(*dst));
+ spin_unlock(&stid->sc_lock);
+}
+
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
struct file *filp = NULL;
@@ -765,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
s->sc_type = 0;
}
-static void
+/**
+ * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * @clp: a pointer to the nfs4_client we're granting a delegation to
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: NULL if an existing delegation was not found.
+ *
+ * On error: -EAGAIN if one was previously granted to this nfs4_client
+ * for this nfs4_file.
+ *
+ */
+
+static int
+nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_delegation *searchdp = NULL;
+ struct nfs4_client *searchclp = NULL;
+
+ lockdep_assert_held(&state_lock);
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
+ searchclp = searchdp->dl_stid.sc_client;
+ if (clp == searchclp) {
+ return -EAGAIN;
+ }
+ }
+ return 0;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp: a pointer to the nfs4_delegation we are adding.
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: 0 if the delegation was successfully hashed.
+ *
+ * On error: -EAGAIN if one was previously granted to this
+ * nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
+ int status;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
lockdep_assert_held(&state_lock);
lockdep_assert_held(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ if (status)
+ return status;
+ ++fp->fi_delegees;
atomic_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+ list_add(&dp->dl_perclnt, &clp->cl_delegations);
+ return 0;
}
static bool
@@ -1792,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+int strdup_if_nonnull(char **target, char *source)
{
- if (source->cr_principal) {
- target->cr_principal =
- kstrdup(source->cr_principal, GFP_KERNEL);
- if (target->cr_principal == NULL)
+ if (source) {
+ *target = kstrdup(source, GFP_KERNEL);
+ if (!*target)
return -ENOMEM;
} else
- target->cr_principal = NULL;
+ *target = NULL;
+ return 0;
+}
+
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+ int ret;
+
+ ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
+ if (ret)
+ return ret;
+ ret = strdup_if_nonnull(&target->cr_raw_principal,
+ source->cr_raw_principal);
+ if (ret)
+ return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
@@ -1904,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
return false;
if (!svc_rqst_integrity_protected(rqstp))
return false;
+ if (cl->cl_cred.cr_raw_principal)
+ return 0 == strcmp(cl->cl_cred.cr_raw_principal,
+ cr->cr_raw_principal);
if (!cr->cr_principal)
return false;
return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
@@ -2175,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
base = resp->cstate.data_offset;
slot->sl_datalen = buf->len - base;
if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
- WARN("%s: sessions DRC could not cache compound\n", __func__);
+ WARN(1, "%s: sessions DRC could not cache compound\n",
+ __func__);
return;
}
@@ -2256,15 +2338,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
clid->flags = new->cl_exchange_flags;
}
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+ struct nfs4_openowner *oo;
+
+ list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) {
+ if (!list_empty(&oo->oo_owner.so_stateids))
+ return true;
+ }
+ return false;
+}
+
static bool client_has_state(struct nfs4_client *clp)
{
- /*
- * Note clp->cl_openowners check isn't quite right: there's no
- * need to count owners without stateid's.
- *
- * Also note we should probably be using this in 4.0 case too.
- */
- return !list_empty(&clp->cl_openowners)
+ return client_has_openowners(clp)
#ifdef CONFIG_NFSD_PNFS
|| !list_empty(&clp->cl_lo_states)
#endif
@@ -2295,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
+ new = create_client(exid->clname, rqstp, &verf);
+ if (new == NULL)
+ return nfserr_jukebox;
+
switch (exid->spa_how) {
case SP4_MACH_CRED:
- if (!svc_rqst_integrity_protected(rqstp))
- return nfserr_inval;
+ if (!svc_rqst_integrity_protected(rqstp)) {
+ status = nfserr_inval;
+ goto out_nolock;
+ }
+ /*
+ * Sometimes userspace doesn't give us a principal.
+ * Which is a bug, really. Anyway, we can't enforce
+ * MACH_CRED in that case, better to give up now:
+ */
+ if (!new->cl_cred.cr_principal &&
+ !new->cl_cred.cr_raw_principal) {
+ status = nfserr_serverfault;
+ goto out_nolock;
+ }
+ new->cl_mach_cred = true;
case SP4_NONE:
break;
default: /* checked by xdr code */
@@ -2307,10 +2411,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
return nfserr_encr_alg_unsupp;
}
- new = create_client(exid->clname, rqstp, &verf);
- if (new == NULL)
- return nfserr_jukebox;
-
/* Cases below refer to rfc 5661 section 18.35.4: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&exid->clname, nn);
@@ -2372,7 +2472,6 @@ out_new:
goto out;
}
new->cl_minorversion = cstate->minorversion;
- new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -2390,6 +2489,7 @@ out_copy:
out:
spin_unlock(&nn->client_lock);
+out_nolock:
if (new)
expire_client(new);
if (unconf)
@@ -3049,7 +3149,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Cases below refer to rfc 3530 section 14.2.33: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&clname, nn);
- if (conf) {
+ if (conf && client_has_state(conf)) {
/* case 0: */
status = nfserr_clid_inuse;
if (clp_used_exchangeid(conf))
@@ -3136,6 +3236,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
} else { /* case 3: normal case; new or rebooted client */
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
+ status = nfserr_clid_inuse;
+ if (client_has_state(old)
+ && !same_creds(&unconf->cl_cred,
+ &old->cl_cred))
+ goto out;
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -3317,6 +3422,27 @@ static const struct nfs4_stateowner_operations openowner_ops = {
.so_free = nfs4_free_openowner,
};
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+ struct nfs4_ol_stateid *local, *ret = NULL;
+ struct nfs4_openowner *oo = open->op_openowner;
+
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+ /* ignore lock owners */
+ if (local->st_stateowner->so_is_open_owner == 0)
+ continue;
+ if (local->st_stateowner == &oo->oo_owner) {
+ ret = local;
+ atomic_inc(&ret->st_stid.sc_count);
+ break;
+ }
+ }
+ return ret;
+}
+
static struct nfs4_openowner *
alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
struct nfsd4_compound_state *cstate)
@@ -3348,9 +3474,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
return ret;
}
-static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
+ struct nfsd4_open *open)
+{
+
struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_ol_stateid *retstp = NULL;
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ spin_lock(&fp->fi_lock);
+
+ retstp = nfsd4_find_existing_open(fp, open);
+ if (retstp)
+ goto out_unlock;
atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
@@ -3360,12 +3497,14 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
stp->st_openstp = NULL;
- spin_lock(&oo->oo_owner.so_client->cl_lock);
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
- spin_lock(&fp->fi_lock);
list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ return retstp;
}
/*
@@ -3539,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
nfs4_put_stid(&dp->dl_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
.prepare = nfsd4_cb_recall_prepare,
.done = nfsd4_cb_recall_done,
.release = nfsd4_cb_recall_release,
@@ -3776,27 +3915,6 @@ out:
return nfs_ok;
}
-static struct nfs4_ol_stateid *
-nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
-{
- struct nfs4_ol_stateid *local, *ret = NULL;
- struct nfs4_openowner *oo = open->op_openowner;
-
- spin_lock(&fp->fi_lock);
- list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
- /* ignore lock owners */
- if (local->st_stateowner->so_is_open_owner == 0)
- continue;
- if (local->st_stateowner == &oo->oo_owner) {
- ret = local;
- atomic_inc(&ret->st_stid.sc_count);
- break;
- }
- }
- spin_unlock(&fp->fi_lock);
- return ret;
-}
-
static inline int nfs4_access_to_access(u32 nfs4_access)
{
int flags = 0;
@@ -3945,6 +4063,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
return fl;
}
+/**
+ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
+ * @dp: a pointer to the nfs4_delegation we're adding.
+ *
+ * Return:
+ * On success: return code will be 0.
+ *
+ * On error: -EAGAIN if there was an existing delegation;
+ * any other nonzero value indicates a different error.
+ *
+ */
+
static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
@@ -3976,16 +4106,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
goto out_unlock;
/* Race breaker */
if (fp->fi_deleg_file) {
- status = 0;
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
+ status = hash_delegation_locked(dp, fp);
goto out_unlock;
}
fp->fi_deleg_file = filp;
- fp->fi_delegees = 1;
- hash_delegation_locked(dp, fp);
+ fp->fi_delegees = 0;
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
+ if (status) {
+ /* Should never happen, this is a new fi_deleg_file */
+ WARN_ON_ONCE(1);
+ goto out_fput;
+ }
return 0;
out_unlock:
spin_unlock(&fp->fi_lock);
@@ -4005,6 +4138,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+ if (status)
+ return ERR_PTR(status);
+
dp = alloc_init_deleg(clp, fh, odstate);
if (!dp)
return ERR_PTR(-ENOMEM);
@@ -4023,9 +4165,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
status = -EAGAIN;
goto out_unlock;
}
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
- status = 0;
+ status = hash_delegation_locked(dp, fp);
out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -4160,6 +4300,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct nfs4_ol_stateid *stp = NULL;
+ struct nfs4_ol_stateid *swapstp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -4173,7 +4314,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ spin_lock(&fp->fi_lock);
stp = nfsd4_find_existing_open(fp, open);
+ spin_unlock(&fp->fi_lock);
} else {
open->op_file = NULL;
status = nfserr_bad_stateid;
@@ -4187,15 +4330,32 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
*/
if (stp) {
/* Stateid was found, this is an OPEN upgrade */
+ down_read(&stp->st_rwsem);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
- if (status)
+ if (status) {
+ up_read(&stp->st_rwsem);
goto out;
+ }
} else {
stp = open->op_stp;
open->op_stp = NULL;
- init_open_stateid(stp, fp, open);
+ swapstp = init_open_stateid(stp, fp, open);
+ if (swapstp) {
+ nfs4_put_stid(&stp->st_stid);
+ stp = swapstp;
+ down_read(&stp->st_rwsem);
+ status = nfs4_upgrade_open(rqstp, fp, current_fh,
+ stp, open);
+ if (status) {
+ up_read(&stp->st_rwsem);
+ goto out;
+ }
+ goto upgrade_out;
+ }
+ down_read(&stp->st_rwsem);
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
+ up_read(&stp->st_rwsem);
release_open_stateid(stp);
goto out;
}
@@ -4205,8 +4365,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (stp->st_clnt_odstate == open->op_odstate)
open->op_odstate = NULL;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+upgrade_out:
+ nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
+ up_read(&stp->st_rwsem);
if (nfsd4_has_session(&resp->cstate)) {
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4410,8 +4571,7 @@ static void
laundromat_main(struct work_struct *laundry)
{
time_t t;
- struct delayed_work *dwork = container_of(laundry, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(laundry);
struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
laundromat_work);
@@ -4666,10 +4826,9 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
*/
__be32
nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, stateid_t *stateid,
- int flags, struct file **filpp, bool *tmp_file)
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
{
- struct svc_fh *fhp = &cstate->current_fh;
struct inode *ino = d_inode(fhp->fh_dentry);
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4819,10 +4978,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
+ down_write(&stp->st_rwsem);
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
- if (status)
- return status;
- return nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status == nfs_ok)
+ status = nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status != nfs_ok)
+ up_write(&stp->st_rwsem);
+ return status;
}
/*
@@ -4869,6 +5031,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
return status;
oo = openowner(stp->st_stateowner);
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
}
@@ -4899,11 +5062,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
- if (oo->oo_flags & NFS4_OO_CONFIRMED)
+ if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+ up_write(&stp->st_rwsem);
goto put_stateid;
+ }
oo->oo_flags |= NFS4_OO_CONFIRMED;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -4975,13 +5140,11 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
goto put_stateid;
}
nfs4_stateid_downgrade(stp, od->od_share_access);
-
reset_union_bmap_deny(od->od_share_deny, stp);
-
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
status = nfs_ok;
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -5033,8 +5196,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
nfsd4_close_open_stateid(stp);
@@ -5260,6 +5423,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
spin_lock(&fp->fi_lock);
@@ -5428,6 +5592,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&open_stp, nn);
if (status)
goto out;
+ up_write(&open_stp->st_rwsem);
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5435,6 +5600,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
+ if (status == nfs_ok)
+ down_write(&lock_stp->st_rwsem);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
@@ -5512,9 +5679,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
switch (-err) {
case 0: /* success! */
- update_stateid(&lock_stp->st_stid.sc_stateid);
- memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
- sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
status = 0;
break;
case (EAGAIN): /* conflock holds conflicting lock */
@@ -5540,6 +5705,8 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
+ up_write(&lock_stp->st_rwsem);
+
/*
* If this is a new, never-before-used stateid, and we are
* returning an error, then just go ahead and release it.
@@ -5704,11 +5871,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
goto out_nfserr;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
fput:
fput(filp);
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
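nfs4_inc_and_copy_stateid() above folds the old update_stateid()+memcpy pairs into one helper and serializes the bump with the new sc_lock. The wraparound rule it encodes: a stateid seqid of 0 is reserved on the wire (it matches the current generation), so an overflowing increment must land on 1. In isolation, as a sketch (next_si_generation is a hypothetical name):

static inline u32 next_si_generation(u32 gen)
{
	return (gen + 1 == 0) ? 1 : gen + 1;	/* skip the reserved 0 */
}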
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..d6ef0955a979 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1675,6 +1675,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
}
static __be32
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
+ if (status)
+ return status;
+ status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 8 + 8);
+ p = xdr_decode_hyper(p, &clone->cl_src_pos);
+ p = xdr_decode_hyper(p, &clone->cl_dst_pos);
+ p = xdr_decode_hyper(p, &clone->cl_count);
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1785,6 +1804,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
};
static inline bool
@@ -2838,14 +2858,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
@@ -4292,6 +4312,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
};
/*
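The new CLONE decoder reads two stateids followed by exactly three 64-bit hypers, which is why READ_BUF reserves 8 + 8 + 8 bytes. A self-contained sketch of the wire layout it consumes (two big-endian 32-bit words per hyper, high word first; this is plain XDR, not the kernel's DECODE_HEAD machinery):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

static const uint8_t *decode_hyper(const uint8_t *p, uint64_t *out)
{
	uint32_t hi, lo;

	memcpy(&hi, p, 4);
	memcpy(&lo, p + 4, 4);
	*out = ((uint64_t)ntohl(hi) << 32) | ntohl(lo);
	return p + 8;
}

/* The 24 bytes after the two stateids: src offset, dst offset, length. */
static void decode_clone_args(const uint8_t *p, uint64_t *src_pos,
			      uint64_t *dst_pos, uint64_t *count)
{
	p = decode_hyper(p, src_pos);
	p = decode_hyper(p, dst_pos);
	decode_hyper(p, count);
}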
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 46ec934f5dee..54cde9a5864e 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -63,7 +63,6 @@ static unsigned int longest_chain;
static unsigned int longest_chain_cachesize;
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-static void cache_cleaner_func(struct work_struct *unused);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
@@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
};
/*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
- */
-static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-
-/*
* Put a cap on the size of the DRC based on the amount of available
* low memory in the machine.
*
@@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void)
unsigned int i;
unregister_shrinker(&nfsd_reply_cache_shrinker);
- cancel_delayed_work_sync(&cache_cleaner);
for (i = 0; i < drc_hashsize; i++) {
struct list_head *head = &drc_hashtbl[i].lru_head;
@@ -217,10 +208,8 @@ void nfsd_reply_cache_shutdown(void)
drc_hashtbl = NULL;
drc_hashsize = 0;
- if (drc_slab) {
- kmem_cache_destroy(drc_slab);
- drc_slab = NULL;
- }
+ kmem_cache_destroy(drc_slab);
+ drc_slab = NULL;
}
/*
@@ -232,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &b->lru_head);
- schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
static long
@@ -266,7 +254,6 @@ prune_cache_entries(void)
{
unsigned int i;
long freed = 0;
- bool cancel = true;
for (i = 0; i < drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &drc_hashtbl[i];
@@ -275,26 +262,11 @@ prune_cache_entries(void)
continue;
spin_lock(&b->cache_lock);
freed += prune_bucket(b);
- if (!list_empty(&b->lru_head))
- cancel = false;
spin_unlock(&b->cache_lock);
}
-
- /*
- * Conditionally rearm the job to run in RC_EXPIRE since we just
- * ran the pruner.
- */
- if (!cancel)
- mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
-static void
-cache_cleaner_func(struct work_struct *unused)
-{
- prune_cache_entries();
-}
-
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
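With the self-rearming delayed work removed, expired reply-cache entries are reclaimed only when the shrinker runs under memory pressure or when an insertion path prunes its own bucket; nothing rearms a timer anymore. A toy single-threaded sketch of the remaining age-based prune (time() stands in for jiffies here, and the kernel expiry is 120 * HZ):

#include <time.h>

struct entry {
	struct entry *next;	/* LRU order: oldest first */
	time_t timestamp;
};

#define RC_EXPIRE 120	/* seconds */

/* Drop entries older than RC_EXPIRE from the head of the LRU. */
static long prune_bucket(struct entry **head)
{
	long freed = 0;
	time_t now = time(NULL);

	while (*head && now - (*head)->timestamp > RC_EXPIRE) {
		struct entry *victim = *head;

		*head = victim->next;
		/* a real implementation would free(victim) here */
		freed++;
	}
	return freed;
}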
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 350041a40fe5..c1681ce894c5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp)
fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
-#ifdef CONFIG_NFSD_V3
- fhp->fh_pre_saved = 0;
- fhp->fh_post_saved = 0;
-#endif
+ fh_clear_wcc(fhp);
}
fh_drop_write(fhp);
if (exp) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 1e90dad4926b..0770bcb543c8 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_NFSD_NFSFH_H
#define _LINUX_NFSD_NFSFH_H
+#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
#include <uapi/linux/nfsd/nfsfh.h>
@@ -26,16 +27,16 @@ static inline ino_t u32_to_ino_t(__u32 uino)
*/
typedef struct svc_fh {
struct knfsd_fh fh_handle; /* FH data */
+ int fh_maxsize; /* max size for fh_handle */
struct dentry * fh_dentry; /* validated dentry */
struct svc_export * fh_export; /* export pointer */
- int fh_maxsize; /* max size for fh_handle */
- unsigned char fh_locked; /* inode locked by us */
- unsigned char fh_want_write; /* remount protection taken */
+ bool fh_locked; /* inode locked by us */
+ bool fh_want_write; /* remount protection taken */
#ifdef CONFIG_NFSD_V3
- unsigned char fh_post_saved; /* post-op attrs saved */
- unsigned char fh_pre_saved; /* pre-op attrs saved */
+ bool fh_post_saved; /* post-op attrs saved */
+ bool fh_pre_saved; /* pre-op attrs saved */
/* Pre-op attributes saved during fh_lock */
__u64 fh_pre_size; /* size before operation */
@@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
+#ifdef CONFIG_CRC32
+/**
+ * knfsd_fh_hash - calculate the crc32 hash for the filehandle
+ * @fh: pointer to filehandle
+ *
+ * Returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by Wireshark.
+ */
+
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+}
+#else
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_NFSD_V3
/*
* The wcc data stored in current_fh should be cleared
@@ -213,8 +236,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
static inline void
fh_clear_wcc(struct svc_fh *fhp)
{
- fhp->fh_post_saved = 0;
- fhp->fh_pre_saved = 0;
+ fhp->fh_post_saved = false;
+ fhp->fh_pre_saved = false;
}
/*
@@ -231,7 +254,7 @@ fill_pre_wcc(struct svc_fh *fhp)
fhp->fh_pre_ctime = inode->i_ctime;
fhp->fh_pre_size = inode->i_size;
fhp->fh_pre_change = inode->i_version;
- fhp->fh_pre_saved = 1;
+ fhp->fh_pre_saved = true;
}
}
@@ -267,7 +290,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
inode = d_inode(dentry);
mutex_lock_nested(&inode->i_mutex, subclass);
fill_pre_wcc(fhp);
- fhp->fh_locked = 1;
+ fhp->fh_locked = true;
}
static inline void
@@ -285,7 +308,7 @@ fh_unlock(struct svc_fh *fhp)
if (fhp->fh_locked) {
fill_post_wcc(fhp);
mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
- fhp->fh_locked = 0;
+ fhp->fh_locked = false;
}
}
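knfsd_fh_hash() above is just the standard CRC-32 of the filehandle bytes: ~crc32_le(0xFFFFFFFF, buf, len) is the reflected 0xEDB88320 polynomial with the usual pre- and post-inversion, so the value matches what Wireshark displays for an NFS filehandle. A portable bitwise equivalent for checking that claim outside the kernel:

#include <stddef.h>
#include <stdint.h>

/* Reflected CRC-32 (poly 0xEDB88320), same result as zlib's crc32(). */
static uint32_t fh_hash(const unsigned char *buf, size_t len)
{
	uint32_t crc = 0xFFFFFFFF;

	while (len--) {
		crc ^= *buf++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320 & -(crc & 1));
	}
	return ~crc;
}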
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ad4e2377dd63..45007acaf364 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -14,9 +14,13 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <net/net_namespace.h>
#include "nfsd.h"
#include "cache.h"
@@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net)
nfsd_shutdown_generic();
}
+static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inetaddr_notifier = {
+ .notifier_call = nfsd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int nfsd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct net_device *dev = ifa->idev->dev;
+ struct net *net = dev_net(dev);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inet6addr_notifier = {
+ .notifier_call = nfsd_inet6addr_event,
+};
+#endif
+
static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
* started, then nfsd_last_thread will be run before any of this
- * other initialization has been done.
+ * other initialization has been done except the rpcb information.
*/
+ svc_rpcb_cleanup(serv, net);
if (!nn->nfsd_net_up)
return;
- nfsd_shutdown_net(net);
-
- svc_rpcb_cleanup(serv, net);
+ nfsd_shutdown_net(net);
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
nfsd_export_flush(net);
@@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net)
}
set_max_drc();
+ register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 583ffc13cae2..c050c53036a6 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -65,7 +65,7 @@ struct nfsd4_callback {
struct nfs4_client *cb_clp;
u32 cb_minorversion;
struct rpc_message cb_msg;
- struct nfsd4_callback_ops *cb_ops;
+ const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
int cb_seq_status;
int cb_status;
@@ -84,7 +84,7 @@ struct nfsd4_callback_ops {
* fields that are of general use to any stateid.
*/
struct nfs4_stid {
- atomic_t sc_count;
+ atomic_t sc_count;
#define NFS4_OPEN_STID 1
#define NFS4_LOCK_STID 2
#define NFS4_DELEG_STID 4
@@ -94,11 +94,12 @@ struct nfs4_stid {
#define NFS4_REVOKED_DELEG_STID 16
#define NFS4_CLOSED_DELEG_STID 32
#define NFS4_LAYOUT_STID 64
- unsigned char sc_type;
- stateid_t sc_stateid;
- struct nfs4_client *sc_client;
- struct nfs4_file *sc_file;
- void (*sc_free)(struct nfs4_stid *);
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ spinlock_t sc_lock;
+ struct nfs4_client *sc_client;
+ struct nfs4_file *sc_file;
+ void (*sc_free)(struct nfs4_stid *);
};
/*
@@ -364,15 +365,6 @@ struct nfs4_client_reclaim {
char cr_recdir[HEXDIR_LEN]; /* recover dir */
};
-static inline void
-update_stateid(stateid_t *stateid)
-{
- stateid->si_generation++;
- /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
- if (stateid->si_generation == 0)
- stateid->si_generation = 1;
-}
-
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
* The OPEN response, typically the largest, requires
* 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
@@ -534,15 +526,16 @@ struct nfs4_file {
* Better suggestions welcome.
*/
struct nfs4_ol_stateid {
- struct nfs4_stid st_stid; /* must be first field */
- struct list_head st_perfile;
- struct list_head st_perstateowner;
- struct list_head st_locks;
- struct nfs4_stateowner * st_stateowner;
- struct nfs4_clnt_odstate * st_clnt_odstate;
- unsigned char st_access_bmap;
- unsigned char st_deny_bmap;
- struct nfs4_ol_stateid * st_openstp;
+ struct nfs4_stid st_stid;
+ struct list_head st_perfile;
+ struct list_head st_perstateowner;
+ struct list_head st_locks;
+ struct nfs4_stateowner *st_stateowner;
+ struct nfs4_clnt_odstate *st_clnt_odstate;
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
+ struct nfs4_ol_stateid *st_openstp;
+ struct rw_semaphore st_rwsem;
};
static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
@@ -561,6 +554,7 @@ struct nfs4_layout_stateid {
struct nfsd4_callback ls_recall;
stateid_t ls_recall_sid;
bool ls_recalled;
+ struct mutex ls_mutex;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
@@ -584,8 +578,8 @@ struct nfsd4_compound_state;
struct nfsd_net;
extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, stateid_t *stateid,
- int flags, struct file **filp, bool *tmp_file);
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn);
@@ -593,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab);
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
@@ -604,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 82f89070594c..90967466a1e5 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,5 +1,3 @@
-#include "state.h"
-
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c668520c344b..3287041905da 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -8,6 +8,49 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include "nfsfh.h"
+
+DECLARE_EVENT_CLASS(nfsd_io_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ loff_t offset,
+ int len),
+ TP_ARGS(rqstp, fhp, offset, len),
+ TP_STRUCT__entry(
+ __field(__be32, xid)
+ __field_struct(struct knfsd_fh, fh)
+ __field(loff_t, offset)
+ __field(int, len)
+ ),
+ TP_fast_assign(
+ __entry->xid = rqstp->rq_xid;
+ fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
+ __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_NFSD_IO_EVENT(name) \
+DEFINE_EVENT(nfsd_io_class, name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ loff_t offset, \
+ int len), \
+ TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_IO_EVENT(read_start);
+DEFINE_NFSD_IO_EVENT(read_opened);
+DEFINE_NFSD_IO_EVENT(read_io_done);
+DEFINE_NFSD_IO_EVENT(read_done);
+DEFINE_NFSD_IO_EVENT(write_start);
+DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_io_done);
+DEFINE_NFSD_IO_EVENT(write_done);
+
+#include "state.h"
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..6739077f17fe 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -36,12 +36,14 @@
#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
+#include "../internal.h"
#include "acl.h"
#include "idmap.h"
#endif /* CONFIG_NFSD_V4 */
#include "nfsd.h"
#include "vfs.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
@@ -217,10 +219,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
- /*
- * check if we have crossed a mount point ...
- */
if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
@@ -498,6 +506,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif
+__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
+ u64 dst_pos, u64 count)
+{
+ return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
+ count));
+}
+
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, loff_t len,
int flags)
@@ -983,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct raparms *ra;
__be32 err;
+ trace_read_start(rqstp, fhp, offset, vlen);
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
return err;
ra = nfsd_init_raparms(file);
+
+ trace_read_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+ trace_read_io_done(rqstp, fhp, offset, vlen);
+
if (ra)
nfsd_put_raparams(file, ra);
fput(file);
+ trace_read_done(rqstp, fhp, offset, vlen);
+
return err;
}
@@ -1008,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
{
__be32 err = 0;
+ trace_write_start(rqstp, fhp, offset, vlen);
+
if (file) {
err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
} else {
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
if (cnt)
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
cnt, stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
fput(file);
}
out:
+ trace_write_done(rqstp, fhp, offset, vlen);
return err;
}
@@ -1631,7 +1660,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
/* cannot use fh_lock as we need deadlock protective ordering
* so do it by hand */
trap = lock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 1;
+ ffhp->fh_locked = tfhp->fh_locked = true;
fill_pre_wcc(ffhp);
fill_pre_wcc(tfhp);
@@ -1681,7 +1710,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
unlock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 0;
+ ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
out:
@@ -1809,7 +1838,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1856,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1853,7 +1872,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fee2451ae248..c11ba316f23f 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -56,6 +56,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
struct xdr_netobj *);
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
+__be32 nfsd4_clone_file_range(struct file *, u64, struct file *,
+ u64, u64);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -112,14 +114,14 @@ static inline int fh_want_write(struct svc_fh *fh)
int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
if (!ret)
- fh->fh_want_write = 1;
+ fh->fh_want_write = true;
return ret;
}
static inline void fh_drop_write(struct svc_fh *fh)
{
if (fh->fh_want_write) {
- fh->fh_want_write = 0;
+ fh->fh_want_write = false;
mnt_drop_write(fh->fh_export->ex_path.mnt);
}
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9f991007a578..d9554813e58a 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -491,6 +491,15 @@ struct nfsd4_fallocate {
u64 falloc_length;
};
+struct nfsd4_clone {
+ /* request */
+ stateid_t cl_src_stateid;
+ stateid_t cl_dst_stateid;
+ u64 cl_src_pos;
+ u64 cl_dst_pos;
+ u64 cl_count;
+};
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -555,6 +564,7 @@ struct nfsd4_op {
/* NFSv4.2 */
struct nfsd4_fallocate allocate;
struct nfsd4_fallocate deallocate;
+ struct nfsd4_clone clone;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
@@ -632,7 +642,7 @@ static inline void
set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
BUG_ON(!fhp->fh_pre_saved);
- cinfo->atomic = fhp->fh_post_saved;
+ cinfo->atomic = (u32)fhp->fh_post_saved;
cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
cinfo->before_change = fhp->fh_pre_change;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b7839b..2ccbf5531554 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
/**
* nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
*/
static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
- const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock)
{
unsigned long nfree;
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ spin_lock(lock);
nfree = le32_to_cpu(desc->pg_nfrees);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ spin_unlock(lock);
return nfree;
}
/**
* nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
* @n: delta to be added
*/
-static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
- unsigned long group,
- struct nilfs_palloc_group_desc *desc,
- u32 n)
+static u32
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock, u32 n)
{
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ u32 nfree;
+
+ spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ nfree = le32_to_cpu(desc->pg_nfrees);
+ spin_unlock(lock);
+ return nfree;
}
/**
@@ -240,6 +240,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
}
/**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+ struct nilfs_bh_assoc *prev,
+ spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (prev->bh && blkoff == prev->blkoff) {
+ brelse(prev->bh);
+ prev->bh = NULL;
+ }
+ spin_unlock(lock);
+ return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
* nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -278,6 +298,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
}
/**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+ unsigned long group)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_bitmap_blkoff(inode,
+ group),
+ &cache->prev_bitmap, &cache->lock);
+}
+
+/**
* nilfs_palloc_get_entry_block - get buffer head of an entry block
* @inode: inode of metadata file using this allocator
* @nr: serial number of the entry (e.g. inode number)
@@ -296,6 +332,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
}
/**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_entry_blkoff(inode, nr),
+ &cache->prev_entry, &cache->lock);
+}
+
+/**
* nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
/**
* nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
* @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
* @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
*/
-static int nilfs_palloc_find_available_slot(struct inode *inode,
- unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned char *bitmap,
- int bsize)
-{
- int curr, pos, end, i;
-
- if (target > 0) {
- end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, target);
- if (pos < end &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
- return pos;
- } else
- end = 0;
-
- for (i = 0, curr = end;
- i < bsize;
- i += BITS_PER_LONG, curr += BITS_PER_LONG) {
- /* wrap around */
- if (curr >= bsize)
- curr = 0;
- while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
- != ~0UL) {
- end = curr + BITS_PER_LONG;
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, curr);
- if ((pos < end) &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos,
- bitmap))
+ unsigned bsize,
+ spinlock_t *lock)
+{
+ int pos, end = bsize;
+
+ if (likely(target < bsize)) {
+ pos = target;
+ do {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
- }
+ } while (++pos < end);
+
+ end = target;
+ }
+
+ /* wrap around */
+ for (pos = 0; pos < end; pos++) {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+ return pos;
}
+
return -ENOSPC;
}
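The rewritten search is two linear passes over the bitmap: scan [target, bsize) for a zero bit, then wrap and scan [0, target). A simplified single-threaded sketch (the kernel version loops because nilfs_set_bit_atomic() can lose a race and must resume the scan; find_next_zero_bit() below is a byte-wise stand-in for find_next_zero_bit_le):

static int find_next_zero_bit(const unsigned char *map, int end, int start)
{
	for (int pos = start; pos < end; pos++)
		if (!(map[pos / 8] & (1 << (pos % 8))))
			return pos;
	return end;
}

static int find_available_slot(unsigned char *map, int target, int bsize)
{
	int pos, end = bsize;

	if (target < bsize) {
		pos = find_next_zero_bit(map, end, target);
		if (pos < end)
			goto found;
		end = target;		/* wrap: only [0, target) is left */
	}
	pos = find_next_zero_bit(map, end, 0);
	if (pos >= end)
		return -1;		/* -ENOSPC in the kernel */
found:
	map[pos / 8] |= 1 << (pos % 8);	/* claimed atomically in the kernel */
	return pos;
}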
@@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, maxgroup, ngroups;
unsigned long group_offset, maxgroup_offset;
- unsigned long n, entries_per_group, groups_per_desc_block;
+ unsigned long n, entries_per_group;
unsigned long i, j;
+ spinlock_t *lock;
int pos, ret;
ngroups = nilfs_palloc_groups_count(inode);
maxgroup = ngroups - 1;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
entries_per_group = nilfs_palloc_entries_per_group(inode);
- groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
for (i = 0; i < ngroups; i += n) {
if (group >= ngroups) {
@@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
for (j = 0; j < n; j++, desc++, group++) {
- if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
- > 0) {
+ lock = nilfs_mdt_bgl_lock(inode, group);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
ret = nilfs_palloc_get_bitmap_block(
inode, group, 1, &bitmap_bh);
if (ret < 0)
@@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- inode, group, group_offset, bitmap,
- entries_per_group);
+ bitmap, group_offset,
+ entries_per_group, lock);
if (pos >= 0) {
/* found a free entry */
nilfs_palloc_group_desc_add_entries(
- inode, group, desc, -1);
+ desc, lock, -1);
req->pr_entry_nr =
entries_per_group * group + pos;
kunmap(desc_bh->b_page);
@@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
unsigned long group, group_offset;
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+ lock = nilfs_mdt_bgl_lock(inode, group);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned char *bitmap;
unsigned long group, group_offset;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -680,22 +726,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
}
/**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
- __u64 first, last;
-
- first = group * nilfs_palloc_entries_per_group(inode);
- last = first + nilfs_palloc_entries_per_group(inode) - 1;
- return (nr >= first) && (nr <= last);
-}
-
-/**
* nilfs_palloc_freev - deallocate a set of persistent objects
* @inode: inode of metadata file using this allocator
* @entry_nrs: array of entry numbers to be deallocated
@@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
- int i, j, n, ret;
+ __u64 group_min_nr, last_nrs[8];
+ const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+ const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned entry_start, end, pos;
+ spinlock_t *lock;
+ int i, j, k, ret;
+ u32 nfree;
for (i = 0; i < nitems; i = j) {
+ int change_group = false;
+ int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
if (ret < 0)
@@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
brelse(desc_bh);
return ret;
}
- desc_kaddr = kmap(desc_bh->b_page);
- desc = nilfs_palloc_block_get_group_desc(
- inode, group, desc_bh, desc_kaddr);
+
+ /* Get the first entry number of the group */
+ group_min_nr = (__u64)group * epg;
+
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- for (j = i, n = 0;
- (j < nitems) && nilfs_palloc_group_is_in(inode, group,
- entry_nrs[j]);
- j++) {
- nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
- if (!nilfs_clear_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap)) {
- printk(KERN_WARNING
- "%s: entry number %llu already freed\n",
- __func__,
- (unsigned long long)entry_nrs[j]);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ j = i;
+ entry_start = rounddown(group_offset, epb);
+ do {
+ if (!nilfs_clear_bit_atomic(lock, group_offset,
+ bitmap)) {
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)entry_nrs[j],
+ (unsigned long)inode->i_ino);
} else {
n++;
}
- }
- nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+ j++;
+ if (j >= nitems || entry_nrs[j] < group_min_nr ||
+ entry_nrs[j] >= group_min_nr + epg) {
+ change_group = true;
+ } else {
+ group_offset = entry_nrs[j] - group_min_nr;
+ if (group_offset >= entry_start &&
+ group_offset < entry_start + epb) {
+ /* This entry is in the same block */
+ continue;
+ }
+ }
+
+ /* Test if the entry block is empty or not */
+ end = entry_start + epb;
+ pos = nilfs_find_next_bit(bitmap, end, entry_start);
+ if (pos >= end) {
+ last_nrs[nempties++] = entry_nrs[j - 1];
+ if (nempties >= ARRAY_SIZE(last_nrs))
+ break;
+ }
+
+ if (change_group)
+ break;
+
+ /* Go on to the next entry block */
+ entry_start = rounddown(group_offset, epb);
+ } while (true);
kunmap(bitmap_bh->b_page);
- kunmap(desc_bh->b_page);
+ mark_buffer_dirty(bitmap_bh);
+ brelse(bitmap_bh);
+ for (k = 0; k < nempties; k++) {
+ ret = nilfs_palloc_delete_entry_block(inode,
+ last_nrs[k]);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ (unsigned long long)last_nrs[k],
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
+
+ desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+ nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+ kunmap_atomic(desc_kaddr);
mark_buffer_dirty(desc_bh);
- mark_buffer_dirty(bitmap_bh);
nilfs_mdt_mark_dirty(inode);
-
- brelse(bitmap_bh);
brelse(desc_bh);
+
+ if (nfree == nilfs_palloc_entries_per_group(inode)) {
+ ret = nilfs_palloc_delete_bitmap_block(inode, group);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ group,
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
}
return 0;
}
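nilfs_palloc_freev() now also notices when clearing bits leaves an entry block, or an entire group, with no bits set, and deletes the corresponding metadata blocks instead of keeping empty ones around. The emptiness test is a single find_next_bit over the block's slice of the bitmap; a sketch under the same byte-wise stand-in as above:

#include <stdbool.h>

static int find_next_bit(const unsigned char *map, int end, int start)
{
	for (int pos = start; pos < end; pos++)
		if (map[pos / 8] & (1 << (pos % 8)))
			return pos;
	return end;
}

/* True if no entry in [entry_start, entry_start + epb) is still allocated. */
static bool entry_block_empty(const unsigned char *bitmap,
			      int entry_start, int epb)
{
	int end = entry_start + epb;

	return find_next_bit(bitmap, end, entry_start) >= end;
}

When a group's free count reaches entries-per-group, the bitmap block itself is deleted the same way.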
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4bd6451b5703..6e6f49aa53df 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
#define nilfs_set_bit_atomic ext2_set_bit_atomic
#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
#define nilfs_find_next_zero_bit find_next_zero_bit_le
+#define nilfs_find_next_bit find_next_bit_le
/**
* struct nilfs_bh_assoc - block offset and buffer head association
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919fd5bb14a8..3a3821b00486 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
int level, __u64 *keyp, __u64 *ptrp)
{
struct nilfs_btree_node *node, *right;
- __u64 newkey;
- __u64 newptr;
int nchildren, n, move, ncblk;
node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
mark_buffer_dirty(path[level].bp_sib_bh);
- newkey = nilfs_btree_node_get_key(right, 0);
- newptr = path[level].bp_newreq.bpr_ptr;
-
if (move) {
path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(right, path[level].bp_index,
@@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
__u64 key, __u64 ptr,
const __u64 *keys, const __u64 *ptrs, int n)
{
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
struct nilfs_bmap_stats stats;
int ret;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 0d5fada91191..7dc23f100e57 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
- __u64 start;
sector_t blocknr;
void *kaddr;
int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
- start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
kunmap_atomic(kaddr);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 54575e3cc1a2..088ba001c6ef 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
file_update_time(vma->vm_file);
- ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+ ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
goto out;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4a73d6dffabf..10b22527a617 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
goto failed;
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
@@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb,
inode->i_mapping->a_ops = &nilfs_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
} else {
inode->i_op = &nilfs_special_inode_operations;
@@ -522,7 +523,7 @@ static int __nilfs_read_inode(struct super_block *sb,
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
return 0;
failed_unmap:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index dee34d990281..1125f40233ff 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,6 +33,7 @@
#include "page.h"
#include "mdt.h"
+#include <trace/events/nilfs2.h>
#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
+
+ trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
return 0;
}
@@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
get_bh(bh);
submit_bh(mode, bh);
ret = 0;
+
+ trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
out:
get_bh(bh);
*out_bh = bh;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index fe529a87a208..03246cac3338 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
}
/* Default GFP flags using highmem */
-#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 37dd6b05b1b5..7ccdb961eea9 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
struct nilfs_transaction_info ti;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
return err;
@@ -164,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
@@ -571,8 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = {
const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.permission = nilfs_permission,
};
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ff00a0b7acb9..9b4f205d1173 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start;
sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
sector_t b, end;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9b8e3..3b65adaae7e4 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -77,6 +77,36 @@ enum {
NILFS_ST_DONE,
};
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set() and nilfs_sc_cstage_get() are
+ * wrappers for the stage count (nilfs_sc_info->sc_stage.scnt). Users of the
+ * variable must go through them because every stage-count transition must
+ * emit a trace event (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't strictly needed for that purpose since it
+ * doesn't produce tracepoint events; it is provided just to make the
+ * intention clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+ sci->sc_stage.scnt++;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+ sci->sc_stage.scnt = next_scnt;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+ return sci->sc_stage.scnt;
+}
+
/* State flags of collection */
#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
@@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
{
struct the_nilfs *nilfs;
int ret = nilfs_prepare_segment_lock(ti);
+ struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
return ret;
- if (ret > 0)
+ if (ret > 0) {
+ trace_ti = current->journal_info;
+
+ trace_nilfs2_transaction_transition(sb, trace_ti,
+ trace_ti->ti_count, trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
+ }
sb_start_intwrite(sb);
@@ -199,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
ret = -ENOSPC;
goto failed;
}
+
+ trace_ti = current->journal_info;
+ trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+ trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
failed:
@@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
ti->ti_flags |= NILFS_TI_COMMIT;
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
return 0;
}
if (nilfs->ns_writer) {
@@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
nilfs_segctor_do_flush(sci, 0);
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_SYNC)
@@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
return;
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
@@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
current->journal_info = ti;
for (;;) {
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
down_write(&nilfs->ns_segctor_sem);
if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
break;
@@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
}
static void nilfs_transaction_unlock(struct super_block *sb)
@@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
size_t ndone;
int err = 0;
- switch (sci->sc_stage.scnt) {
+ switch (nilfs_sc_cstage_get(sci)) {
case NILFS_ST_INIT:
/* Pre-processes */
sci->sc_stage.flags = 0;
@@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_nblk_inc = 0;
sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
if (mode == SC_LSEG_DSYNC) {
- sci->sc_stage.scnt = NILFS_ST_DSYNC;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
goto dsync_mode;
}
}
@@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_stage.dirty_file_ptr = NULL;
sci->sc_stage.gc_inode_ptr = NULL;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DAT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
goto dat_stage;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_GC:
if (nilfs_doing_gc()) {
head = &sci->sc_gc_inodes;
@@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.gc_inode_ptr = NULL;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_FILE:
head = &sci->sc_dirty_files;
ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.dirty_file_ptr = NULL;
if (mode == SC_FLUSH_FILE) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
/* Fall through */
case NILFS_ST_IFILE:
@@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
err = nilfs_segctor_create_checkpoint(sci);
if (unlikely(err))
@@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SUFILE:
err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
sci->sc_nfreesegs, &ndone);
@@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_DAT:
dat_stage:
err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SR:
if (mode == SC_LSEG_SR) {
/* Appending a super root */
@@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
/* End of a logical segment */
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DSYNC:
dsync_mode:
@@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DONE:
return 0;
@@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
goto failed;
/* The current segment is filled up */
- if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+ if (mode != SC_LSEG_SR ||
+ nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
break;
nilfs_clear_logs(&sci->sc_segbufs);
@@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int err;
- sci->sc_stage.scnt = NILFS_ST_INIT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
sci->sc_cno = nilfs->ns_cno;
err = nilfs_segctor_collect_dirty_files(sci, nilfs);
@@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
goto failed;
/* Avoid empty segment */
- if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
nilfs_segbuf_empty(sci->sc_curseg)) {
nilfs_segctor_abort_construction(sci, nilfs, 1);
goto out;
@@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
nilfs_segctor_fill_in_file_bmap(sci);
if (mode == SC_LSEG_SR &&
- sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+ nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
err = nilfs_segctor_fill_in_checkpoint(sci);
if (unlikely(err))
goto failed_to_write;
@@ -2007,7 +2069,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
goto failed_to_write;
- if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
/*
* At this point, we avoid double buffering
@@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (err)
goto failed_to_write;
}
- } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+ } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
out:
nilfs_segctor_drop_written_files(sci, nilfs);
@@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
{
int mode = 0;
- int err;
spin_lock(&sci->sc_state_lock);
mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
spin_unlock(&sci->sc_state_lock);
if (mode) {
- err = nilfs_segctor_do_construct(sci, mode);
+ nilfs_segctor_do_construct(sci, mode);
spin_lock(&sci->sc_state_lock);
sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index a48d6de1e02c..0408b9b2814b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -67,7 +67,8 @@ struct nilfs_recovery_info {
/**
* struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
* @flags: State flags
* @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
* @gc_inode_ptr: Pointer on the list of gc-inodes
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c35c362..52821ffc11f4 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
#include "mdt.h"
#include "sufile.h"
+#include <trace/events/nilfs2.h>
+
/**
* struct nilfs_sufile_info - on-memory private data of sufile
* @mi: on-memory private data of metadata file
@@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
__u64 segnum, maxsegnum, last_alloc;
void *kaddr;
- unsigned long nsegments, ncleansegs, nsus, cnt;
+ unsigned long nsegments, nsus, cnt;
int ret, j;
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
- ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
kunmap_atomic(kaddr);
@@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
break; /* never happens */
}
}
+ trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
&su_bh);
if (ret < 0)
@@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+ trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
@@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
NILFS_SUI(sufile)->ncleansegs++;
nilfs_mdt_mark_dirty(sufile);
+
+ trace_nilfs2_segment_usage_freed(sufile, segnum);
}
/**
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585bfeb01..7f5d3d9f1c37 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
struct nilfs_super_block *nsbp;
sector_t blocknr, newblocknr;
unsigned long offset;
- int sb2i = -1; /* array index of the secondary superblock */
+ int sb2i; /* array index of the secondary superblock */
int ret = 0;
/* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
sb2i = 0;
blocknr = nilfs->ns_sbh[0]->b_blocknr;
+ } else {
+ sb2i = -1;
+ blocknr = 0;
}
if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
goto out; /* super block location is unchanged */
@@ -1313,13 +1316,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s->s_root) {
- char b[BDEVNAME_SIZE];
-
- s_new = true;
+ s_new = true;
/* New superblock instance created */
s->s_mode = mode;
- strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
sb_set_blocksize(s, block_size(sd.bdev));
err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -1405,21 +1406,18 @@ static void nilfs_destroy_cachep(void)
*/
rcu_barrier();
- if (nilfs_inode_cachep)
- kmem_cache_destroy(nilfs_inode_cachep);
- if (nilfs_transaction_cachep)
- kmem_cache_destroy(nilfs_transaction_cachep);
- if (nilfs_segbuf_cachep)
- kmem_cache_destroy(nilfs_segbuf_cachep);
- if (nilfs_btree_path_cache)
- kmem_cache_destroy(nilfs_btree_path_cache);
+ kmem_cache_destroy(nilfs_inode_cachep);
+ kmem_cache_destroy(nilfs_transaction_cachep);
+ kmem_cache_destroy(nilfs_segbuf_cachep);
+ kmem_cache_destroy(nilfs_btree_path_cache);
}
static int __init nilfs_init_cachep(void)
{
nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
sizeof(struct nilfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ nilfs_inode_init_once);
if (!nilfs_inode_cachep)
goto fail;
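
[Editor's note] Two details in the hunks above are easy to miss: kmem_cache_destroy() is a no-op when passed NULL, which is what makes dropping the if checks safe, and SLAB_ACCOUNT makes the inode allocations chargeable to the memory cgroup. A minimal sketch of the resulting pattern, using hypothetical names rather than the nilfs2 caches:

	#include <linux/module.h>
	#include <linux/slab.h>

	struct demo_obj { int payload; };

	static struct kmem_cache *demo_cachep;

	static int __init demo_init(void)
	{
		/* SLAB_ACCOUNT charges allocations to the memory cgroup */
		demo_cachep = kmem_cache_create("demo_cache",
						sizeof(struct demo_obj), 0,
						SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
						NULL);
		return demo_cachep ? 0 : -ENOMEM;
	}

	static void __exit demo_exit(void)
	{
		/* kmem_cache_destroy() tolerates NULL, so no guard is needed */
		kmem_cache_destroy(demo_cachep);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");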
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 6b6f0d472ae8..fd98e5100cab 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(mark->inode);
if (inode) {
+ /*
+ * IN_ALL_EVENTS represents all of the mask bits
+ * that we expose to userspace. There is at
+ * least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+ u32 mask = mark->mask & IN_ALL_EVENTS;
seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mark->mask, mark->ignored_mask);
+ mask, mark->ignored_mask);
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
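
[Editor's note] The fdinfo change filters the reported mask through IN_ALL_EVENTS, so kernel-internal bits such as FS_EVENT_ON_CHILD never reach userspace. A small userspace sketch (hypothetical example, assumes /proc is mounted) that creates a watch and dumps the fdinfo entry whose mask: field the hunk affects:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/inotify.h>

	int main(void)
	{
		char path[64], line[256];
		FILE *f;
		int fd = inotify_init1(0);

		if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) < 0) {
			perror("inotify");
			return 1;
		}
		/* the "mask:" value below is pre-masked with IN_ALL_EVENTS */
		snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
		f = fopen(path, "r");
		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		close(fd);
		return 0;
	}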
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954c30..741077deef3b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
break;
}
spin_unlock(&next_i->i_lock);
- next_i = list_entry(next_i->i_sb_list.next,
- struct inode, i_sb_list);
+ next_i = list_next_entry(next_i, i_sb_list);
}
/*
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5b1e2a497e51..b8d08d0d0a4d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
int ret;
unsigned flags = 0;
- /* don't allow invalid bits: we don't want flags set */
+ /*
+ * We share a lot of code with fs/dnotify. We also share
+ * the bit layout between inotify's IN_* and the fsnotify
+ * FS_*. This check ensures that only the inotify IN_*
+ * bits get passed in and set in watches/events.
+ */
+ if (unlikely(mask & ~ALL_INOTIFY_BITS))
+ return -EINVAL;
+ /*
+ * Require at least one valid bit set in the mask.
+ * Without _something_ set, we would have no events to
+ * watch for.
+ */
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
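
[Editor's note] With both checks in place, a watch mask must contain at least one valid bit and nothing but valid bits. A userspace sketch of the visible behavior on a kernel carrying this patch (0x1000 is used here as an example of a bit inotify has never defined):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/inotify.h>

	int main(void)
	{
		int fd = inotify_init1(0);

		if (fd < 0)
			return 1;
		/* an undefined bit now fails with EINVAL instead of
		 * being silently stored in the watch */
		if (inotify_add_watch(fd, "/tmp", 0x1000) < 0)
			printf("undefined bit: %s\n", strerror(errno));
		/* an empty mask has nothing to watch for: also EINVAL */
		if (inotify_add_watch(fd, "/tmp", 0) < 0)
			printf("empty mask: %s\n", strerror(errno));
		return 0;
	}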
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc0df4442f7b..cfcbf114676e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,9 +92,6 @@
#include "fsnotify.h"
struct srcu_struct fsnotify_mark_srcu;
-static DEFINE_SPINLOCK(destroy_lock);
-static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
atomic_dec(&group->num_marks);
}
+static void
+fsnotify_mark_free_rcu(struct rcu_head *rcu)
+{
+ struct fsnotify_mark *mark;
+
+ mark = container_of(rcu, struct fsnotify_mark, g_rcu);
+ fsnotify_put_mark(mark);
+}
+
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing actually happens from a call_srcu
+ * callback. Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
@@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
/*
* Some groups like to know that marks are being freed. This is a
@@ -385,11 +388,7 @@ err:
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
-
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
return ret;
}
@@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
atomic_set(&mark->refcnt, 1);
mark->free_mark = free_mark;
}
-
-static int fsnotify_mark_destroy(void *ignored)
-{
- struct fsnotify_mark *mark, *next;
- struct list_head private_destroy_list;
-
- for (;;) {
- spin_lock(&destroy_lock);
- /* exchange the list head */
- list_replace_init(&destroy_list, &private_destroy_list);
- spin_unlock(&destroy_lock);
-
- synchronize_srcu(&fsnotify_mark_srcu);
-
- list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
- list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
- }
-
- wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
- }
-
- return 0;
-}
-
-static int __init fsnotify_mark_init(void)
-{
- struct task_struct *thread;
-
- thread = kthread_run(fsnotify_mark_destroy, NULL,
- "fsnotify_mark");
- if (IS_ERR(thread))
- panic("unable to start fsnotify mark destruction thread.");
-
- return 0;
-}
-device_initcall(fsnotify_mark_init);
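
[Editor's note] The conversion above removes the dedicated destruction kthread entirely: instead of queueing marks on destroy_list and waking a worker that calls synchronize_srcu(), each mark now carries its own rcu_head and is handed to call_srcu(), which runs the callback after the SRCU grace period. A minimal sketch of the same pattern on a hypothetical object:

	#include <linux/slab.h>
	#include <linux/srcu.h>

	/* hypothetical object, mirroring the fsnotify conversion above */
	struct demo_mark {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	DEFINE_STATIC_SRCU(demo_srcu);

	static void demo_mark_free_rcu(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct demo_mark, rcu));
	}

	static void demo_mark_free(struct demo_mark *mark)
	{
		/* no destroy_list, no kthread: the SRCU core invokes the
		 * callback once all readers of demo_srcu have finished */
		call_srcu(&demo_srcu, &mark->rcu, demo_mark_free_rcu);
	}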
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 262561fea923..9d383e5eff0e 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
}
}
err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d1a853585b53..2f77f8dfb861 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void)
ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- ntfs_big_inode_init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d6526dc..a3ded88718c9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
@@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_value_update_clusters,
@@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_tree_update_clusters,
@@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &dx_root->dr_list;
}
-static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
.eo_update_clusters = ocfs2_dx_root_update_clusters,
@@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
return CONTIG_NONE;
}
-static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_refcount_tree_update_clusters,
@@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
- struct ocfs2_extent_tree_operations *ops)
+ const struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
@@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
bail:
- if (tl_inode)
- iput(tl_inode);
+ iput(tl_inode);
brelse(tl_bh);
if (status < 0) {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fb09b97db162..f3dc1b0dfffc 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -54,7 +54,7 @@
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
- struct ocfs2_extent_tree_operations *et_ops;
+ const struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
struct ocfs2_caching_info *et_ci;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 64b11d90eca6..7f604727f487 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = -EIO;
goto bail;
}
+ set_buffer_new(bh_result);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
@@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
if (is_overwrite < 0) {
mlog_errno(is_overwrite);
+ ret = is_overwrite;
ocfs2_inode_unlock(inode, 1);
goto clean_orphan;
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index fa15debcc02b..a3cc6d2fc896 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -219,7 +219,8 @@ struct o2hb_region {
unsigned hr_unclean_stop:1,
hr_aborted_start:1,
hr_item_pinned:1,
- hr_item_dropped:1;
+ hr_item_dropped:1,
+ hr_node_deleted:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data)
set_user_nice(current, MIN_NICE);
/* Pin node */
- o2nm_depend_this_node();
+ ret = o2nm_depend_this_node();
+ if (ret) {
+ mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+ reg->hr_node_deleted = 1;
+ wake_up(&o2hb_steady_queue);
+ return 0;
+ }
while (!kthread_should_stop() &&
!reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1473,16 +1480,17 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
return 0;
}
-static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%u\n", reg->hr_block_bytes);
+ return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
}
-static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
int status;
unsigned long block_bytes;
unsigned int block_bits;
@@ -1501,16 +1509,17 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%llu\n", reg->hr_start_block);
+ return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
}
-static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long long tmp;
char *p = (char *)page;
@@ -1526,16 +1535,16 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", reg->hr_blocks);
+ return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
}
-static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_blocks_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long tmp;
char *p = (char *)page;
@@ -1554,13 +1563,12 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (reg->hr_bdev)
- ret = sprintf(page, "%s\n", reg->hr_dev_name);
+ if (to_o2hb_region(item)->hr_bdev)
+ ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
return ret;
}
@@ -1670,10 +1678,11 @@ out:
}
/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
-static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
struct task_struct *hb_task;
long fd;
int sectsize;
@@ -1771,8 +1780,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
}
++live_threshold;
atomic_set(&reg->hr_steady_iterations, live_threshold);
- /* unsteady_iterations is double the steady_iterations */
- atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+ /* unsteady_iterations is triple the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1787,7 +1796,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
spin_unlock(&o2hb_live_lock);
ret = wait_event_interruptible(o2hb_steady_queue,
- atomic_read(&reg->hr_steady_iterations) == 0);
+ atomic_read(&reg->hr_steady_iterations) == 0 ||
+ reg->hr_node_deleted);
if (ret) {
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
@@ -1798,6 +1808,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
goto out3;
}
+ if (reg->hr_node_deleted) {
+ ret = -EINVAL;
+ goto out3;
+ }
+
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
@@ -1828,9 +1843,9 @@ out:
return ret;
}
-static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
pid_t pid = 0;
spin_lock(&o2hb_live_lock);
@@ -1844,92 +1859,23 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
return sprintf(page, "%u\n", pid);
}
-struct o2hb_region_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_region *, char *);
- ssize_t (*store)(struct o2hb_region *, const char *, size_t);
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "block_bytes",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_block_bytes_read,
- .store = o2hb_region_block_bytes_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_start_block = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "start_block",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_start_block_read,
- .store = o2hb_region_start_block_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_blocks = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "blocks",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_blocks_read,
- .store = o2hb_region_blocks_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_dev = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dev",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_dev_read,
- .store = o2hb_region_dev_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_pid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "pid",
- .ca_mode = S_IRUGO | S_IRUSR },
- .show = o2hb_region_pid_read,
-};
+CONFIGFS_ATTR(o2hb_region_, block_bytes);
+CONFIGFS_ATTR(o2hb_region_, start_block);
+CONFIGFS_ATTR(o2hb_region_, blocks);
+CONFIGFS_ATTR(o2hb_region_, dev);
+CONFIGFS_ATTR_RO(o2hb_region_, pid);
static struct configfs_attribute *o2hb_region_attrs[] = {
- &o2hb_region_attr_block_bytes.attr,
- &o2hb_region_attr_start_block.attr,
- &o2hb_region_attr_blocks.attr,
- &o2hb_region_attr_dev.attr,
- &o2hb_region_attr_pid.attr,
+ &o2hb_region_attr_block_bytes,
+ &o2hb_region_attr_start_block,
+ &o2hb_region_attr_blocks,
+ &o2hb_region_attr_dev,
+ &o2hb_region_attr_pid,
NULL,
};
-static ssize_t o2hb_region_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_region_attr->show)
- ret = o2hb_region_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_region_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_region_attr->store)
- ret = o2hb_region_attr->store(reg, page, count);
- return ret;
-}
-
static struct configfs_item_operations o2hb_region_item_ops = {
.release = o2hb_region_release,
- .show_attribute = o2hb_region_show,
- .store_attribute = o2hb_region_store,
};
static struct config_item_type o2hb_region_type = {
@@ -2124,49 +2070,14 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-struct o2hb_heartbeat_group_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
- ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
-};
-
-static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_heartbeat_group_attr->show)
- ret = o2hb_heartbeat_group_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_heartbeat_group_attr->store)
- ret = o2hb_heartbeat_group_attr->store(reg, page, count);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
- char *page)
+static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
- const char *page,
- size_t count)
+static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned long tmp;
char *p = (char *)page;
@@ -2181,17 +2092,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
-static
-ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
- char *page)
+static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%s\n",
o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
}
-static
-ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
- const char *page, size_t count)
+static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned int i;
int ret;
@@ -2216,33 +2125,15 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
}
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dead_threshold",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_threshold_show,
- .store = o2hb_heartbeat_group_threshold_store,
-};
-
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "mode",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_mode_show,
- .store = o2hb_heartbeat_group_mode_store,
-};
+CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold.attr,
- &o2hb_heartbeat_group_attr_mode.attr,
+ &o2hb_heartbeat_group_attr_threshold,
+ &o2hb_heartbeat_group_attr_mode,
NULL,
};
-static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
- .show_attribute = o2hb_heartbeat_group_show,
- .store_attribute = o2hb_heartbeat_group_store,
-};
-
static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
.make_item = o2hb_heartbeat_group_make_item,
.drop_item = o2hb_heartbeat_group_drop_item,
@@ -2250,7 +2141,6 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
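
[Editor's note] The whole conversion in this file rests on the CONFIGFS_ATTR() and CONFIGFS_ATTR_RO() helpers, which generate the struct configfs_attribute from a naming convention: CONFIGFS_ATTR(p_, name) expects p_name_show() and p_name_store() taking a bare config_item and emits p_attr_name, so the per-subsystem show_attribute/store_attribute multiplexers become dead code. A minimal sketch with a hypothetical item type, not taken from ocfs2:

	#include <linux/configfs.h>

	struct demo_item {
		struct config_item item;
		int value;
	};

	static inline struct demo_item *to_demo_item(struct config_item *item)
	{
		return container_of(item, struct demo_item, item);
	}

	static ssize_t demo_value_show(struct config_item *item, char *page)
	{
		return sprintf(page, "%d\n", to_demo_item(item)->value);
	}

	static ssize_t demo_value_store(struct config_item *item,
					const char *page, size_t count)
	{
		int ret = kstrtoint(page, 0, &to_demo_item(item)->value);

		return ret ? ret : count;
	}

	/* expands to "struct configfs_attribute demo_attr_value", wired to
	 * demo_value_show/demo_value_store with mode S_IRUGO | S_IWUSR */
	CONFIGFS_ATTR(demo_, value);

	static struct configfs_attribute *demo_attrs[] = {
		&demo_attr_value,
		NULL,
	};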
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 441c84e169e6..ebe543894db0 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -172,9 +172,9 @@ static void o2nm_node_release(struct config_item *item)
kfree(node);
}
-static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_num_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_num);
+ return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num);
}
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
@@ -188,15 +188,16 @@ enum {
O2NM_NODE_ATTR_NUM = 0,
O2NM_NODE_ATTR_PORT,
O2NM_NODE_ATTR_ADDRESS,
- O2NM_NODE_ATTR_LOCAL,
};
-static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
+ int ret = 0;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
@@ -215,26 +216,30 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
- p = NULL;
+ ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_NUM,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
- if (p == NULL)
- return -EEXIST;
+ if (ret)
+ return ret;
return count;
}
-static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+ return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port));
}
-static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_port_store(struct config_item *item,
const char *page, size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
unsigned long tmp;
char *p = (char *)page;
@@ -247,20 +252,23 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
if (tmp >= (u16)-1)
return -ERANGE;
+ if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+ return -EBUSY;
node->nd_ipv4_port = htons(tmp);
return count;
}
-static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page)
{
- return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
+ return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address);
}
-static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
int ret, i;
struct rb_node **p, *parent;
@@ -282,6 +290,9 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
write_lock(&cluster->cl_nodes_lock);
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
rb_link_node(&node->nd_ip_node, parent, p);
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
@@ -295,14 +306,15 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
return count;
}
-static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_local_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_local);
+ return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local);
}
-static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
@@ -349,108 +361,21 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
return count;
}
-struct o2nm_node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_node *, char *);
- ssize_t (*store)(struct o2nm_node *, const char *, size_t);
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_num = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "num",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_num_read,
- .store = o2nm_node_num_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_port",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_port_read,
- .store = o2nm_node_ipv4_port_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_address",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_address_read,
- .store = o2nm_node_ipv4_address_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_local_read,
- .store = o2nm_node_local_write,
-};
+CONFIGFS_ATTR(o2nm_node_, num);
+CONFIGFS_ATTR(o2nm_node_, ipv4_port);
+CONFIGFS_ATTR(o2nm_node_, ipv4_address);
+CONFIGFS_ATTR(o2nm_node_, local);
static struct configfs_attribute *o2nm_node_attrs[] = {
- [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
- [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
- [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
- [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+ &o2nm_node_attr_num,
+ &o2nm_node_attr_ipv4_port,
+ &o2nm_node_attr_ipv4_address,
+ &o2nm_node_attr_local,
NULL,
};
-static int o2nm_attr_index(struct configfs_attribute *attr)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
- if (attr == o2nm_node_attrs[i])
- return i;
- }
- BUG();
- return 0;
-}
-
-static ssize_t o2nm_node_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_node_attr->show)
- ret = o2nm_node_attr->show(node, page);
- return ret;
-}
-
-static ssize_t o2nm_node_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret;
- int attr_index = o2nm_attr_index(attr);
-
- if (o2nm_node_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- if (test_bit(attr_index, &node->nd_set_attributes))
- return -EBUSY;
-
- ret = o2nm_node_attr->store(node, page, count);
- if (ret < count)
- goto out;
-
- set_bit(attr_index, &node->nd_set_attributes);
-out:
- return ret;
-}
-
static struct configfs_item_operations o2nm_node_item_ops = {
.release = o2nm_node_release,
- .show_attribute = o2nm_node_show,
- .store_attribute = o2nm_node_store,
};
static struct config_item_type o2nm_node_type = {
@@ -475,12 +400,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
}
#endif
-struct o2nm_cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_cluster *, char *);
- ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
-};
-
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
@@ -501,15 +420,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
return count;
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item,
+ char *page)
{
- return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+ return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms);
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item,
+ const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -536,15 +456,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_keepalive_delay_ms_show(
+ struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+ return sprintf(page, "%u\n",
+ to_o2nm_cluster(item)->cl_keepalive_delay_ms);
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_keepalive_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -571,22 +493,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_reconnect_delay_ms_show(
+ struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+ return sprintf(page, "%u\n",
+ to_o2nm_cluster(item)->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_reconnect_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
return o2nm_cluster_attr_write(page, count,
- &cluster->cl_reconnect_delay_ms);
+ &to_o2nm_cluster(item)->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_fence_method_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_fence_method_show(
+ struct config_item *item, char *page)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret = 0;
if (cluster)
@@ -595,8 +519,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read(
return ret;
}
-static ssize_t o2nm_cluster_attr_fence_method_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_fence_method_store(
+ struct config_item *item, const char *page, size_t count)
{
unsigned int i;
@@ -608,10 +532,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write(
continue;
if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
continue;
- if (cluster->cl_fence_method != i) {
+ if (to_o2nm_cluster(item)->cl_fence_method != i) {
printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
o2nm_fence_method_desc[i]);
- cluster->cl_fence_method = i;
+ to_o2nm_cluster(item)->cl_fence_method = i;
}
return count;
}
@@ -620,79 +544,18 @@ bail:
return -EINVAL;
}
-static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "idle_timeout_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_idle_timeout_ms_read,
- .store = o2nm_cluster_attr_idle_timeout_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "keepalive_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_keepalive_delay_ms_read,
- .store = o2nm_cluster_attr_keepalive_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "reconnect_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_reconnect_delay_ms_read,
- .store = o2nm_cluster_attr_reconnect_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "fence_method",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_fence_method_read,
- .store = o2nm_cluster_attr_fence_method_write,
-};
+CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms);
+CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, fence_method);
static struct configfs_attribute *o2nm_cluster_attrs[] = {
- &o2nm_cluster_attr_idle_timeout_ms.attr,
- &o2nm_cluster_attr_keepalive_delay_ms.attr,
- &o2nm_cluster_attr_reconnect_delay_ms.attr,
- &o2nm_cluster_attr_fence_method.attr,
+ &o2nm_cluster_attr_idle_timeout_ms,
+ &o2nm_cluster_attr_keepalive_delay_ms,
+ &o2nm_cluster_attr_reconnect_delay_ms,
+ &o2nm_cluster_attr_fence_method,
NULL,
};
-static ssize_t o2nm_cluster_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_cluster_attr->show)
- ret = o2nm_cluster_attr->show(cluster, page);
- return ret;
-}
-
-static ssize_t o2nm_cluster_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret;
-
- if (o2nm_cluster_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- ret = o2nm_cluster_attr->store(cluster, page, count);
- if (ret < count)
- goto out;
-out:
- return ret;
-}
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
@@ -773,8 +636,6 @@ static void o2nm_cluster_release(struct config_item *item)
static struct configfs_item_operations o2nm_cluster_item_ops = {
.release = o2nm_cluster_release,
- .show_attribute = o2nm_cluster_show,
- .store_attribute = o2nm_cluster_store,
};
static struct config_item_type o2nm_cluster_type = {
@@ -896,7 +757,7 @@ int o2nm_depend_item(struct config_item *item)
void o2nm_undepend_item(struct config_item *item)
{
- configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+ configfs_undepend_item(item);
}
int o2nm_depend_this_node(void)
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8c83ff..68c607e63ff6 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -376,17 +376,6 @@ struct dlm_lock
lksb_kernel_allocated:1;
};
-
-#define DLM_LKSB_UNUSED1 0x01
-#define DLM_LKSB_PUT_LVB 0x02
-#define DLM_LKSB_GET_LVB 0x04
-#define DLM_LKSB_UNUSED2 0x08
-#define DLM_LKSB_UNUSED3 0x10
-#define DLM_LKSB_UNUSED4 0x20
-#define DLM_LKSB_UNUSED5 0x40
-#define DLM_LKSB_UNUSED6 0x80
-
-
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
DLM_CONVERTING_LIST = 1,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6918f30d02cd..2ee7fe747cea 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
int status;
unsigned int backoff;
unsigned int total_backoff = 0;
+ char wq_name[O2NM_MAX_NAME_LEN];
BUG_ON(!dlm);
@@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+ snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
+ dlm->dlm_worker = create_singlethread_workqueue(wq_name);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ce38b4ccc9ab..9477d6e1de37 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
if (test_bit(node, res->refmap)) {
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ /* get an extra reference on the mle.
+ * otherwise the assert_master from the new
+ * master will destroy this.
+ */
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2544,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (oldmle) {
+ if (ret != -EEXIST && oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
@@ -2554,6 +2559,7 @@ fail:
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
@@ -2571,17 +2577,6 @@ fail:
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
- /* get an extra reference on the mle.
- * otherwise the assert_master from the new
- * master will destroy this.
- * also, make sure that all callers of dlm_get_mle
- * take both dlm->spinlock and dlm->master_lock */
- spin_lock(&dlm->spinlock);
- spin_lock(&dlm->master_lock);
- dlm_get_mle_inuse(mle);
- spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
-
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
@@ -2843,6 +2838,8 @@ again:
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ else
+ res->migration_pending = 0;
spin_unlock(&res->spinlock);
/*
@@ -3048,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = 0;
if (!dlm_grab(dlm))
- return -EINVAL;
+ return 0;
name = migrate->name;
namelen = migrate->namelen;
@@ -3139,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
mlog(0, "tried to migrate %.*s, but some "
"process beat me to it\n",
namelen, name);
- ret = -EEXIST;
+ spin_unlock(&tmp->spinlock);
+ return -EEXIST;
} else {
/* bad. 2 NODES are trying to migrate! */
mlog(ML_ERROR, "migration error mle: "
@@ -3310,6 +3308,15 @@ top:
mle->new_master != dead_node)
continue;
+ if (mle->new_master == dead_node && mle->inuse) {
+ mlog(ML_NOTICE, "%s: target %u died during "
+ "migration from %u, the MLE is "
+ "still keep used, ignore it!\n",
+ dlm->name, dead_node,
+ mle->master);
+ continue;
+ }
+
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 58eaa5c0d387..c5bdf02c213b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
mlog(0, "starting dlm recovery thread...\n");
dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
- "dlm_reco_thread");
+ "dlm_reco-%s", dlm->name);
if (IS_ERR(dlm->dlm_reco_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
dlm->dlm_reco_thread_task = NULL;
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
char *buf = NULL;
struct dlm_work_item *item = NULL;
struct dlm_lock_resource *res = NULL;
+ unsigned int hash;
if (!dlm_grab(dlm))
return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
/* lookup the lock to see if we have a secondary queue for this
* already... just add the locks in and this will have its owner
* and RECOVERY flag changed when it completes. */
- res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
}
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
} else {
+ spin_unlock(&dlm->spinlock);
/* need to allocate, just like if it was
* mastered here normally */
res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -2450,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
- if (test_bit(idx, dlm->recovery_map))
- mlog(0, "domain %s, node %u already added "
- "to recovery map!\n", dlm->name, idx);
- else
- set_bit(idx, dlm->recovery_map);
+ set_bit(idx, dlm->recovery_map);
}
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2e5e6d5fffe8..c5f6c241ecd7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
{
mlog(0, "Starting dlm_thread...\n");
- dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+ dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
+ dlm->name);
if (IS_ERR(dlm->dlm_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
dlm->dlm_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9dbab68c..1082b2c3014b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
if (!dlm_grab(dlm))
- return DLM_REJECTED;
+ return DLM_FORWARD;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27dcb18a..03768bb3aab1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c91103c1333..f92612e4b9d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2432,12 +2432,6 @@ bail:
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS, which will then
* immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
*/
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
@@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
- if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
- ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
@@ -2998,7 +2990,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
}
/* launch downconvert thread */
- osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+ osb->uuid_str);
if (IS_ERR(osb->dc_task)) {
status = PTR_ERR(osb->dc_task);
osb->dc_task = NULL;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..d63127932509 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
}
generic_fillattr(inode, stat);
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ stat->blocks += (stat->size + 511)>>9;
/* We set the blksize from the cluster size for performance */
stat->blksize = osb->s_clustersize;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8f87e05ee25d..97a563bab9a8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -361,6 +361,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
break;
case S_IFLNK:
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..aac8b86f312e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -112,6 +112,8 @@ struct ocfs2_inode_info
#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3cb097ccce60..16b0bb482ea7 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -606,9 +606,7 @@ bail:
if (gb_inode)
mutex_unlock(&gb_inode->i_mutex);
- if (gb_inode)
- iput(gb_inode);
-
+ iput(gb_inode);
brelse(bh);
return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff82b28462a6..3772a2dbb980 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
// up_write(&journal->j_trans_barrier);
done:
- if (inode)
- iput(inode);
+ iput(inode);
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1090,7 +1089,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
/* Launch the commit thread */
if (!local) {
osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
- "ocfs2cmt");
+ "ocfs2cmt-%s", osb->uuid_str);
if (IS_ERR(osb->commit_task)) {
status = PTR_ERR(osb->commit_task);
osb->commit_task = NULL;
@@ -1507,7 +1506,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
goto out;
osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
- "ocfs2rec");
+ "ocfs2rec-%s", osb->uuid_str);
if (IS_ERR(osb->recovery_thread_task)) {
mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
osb->recovery_thread_task = NULL;
@@ -1687,9 +1686,7 @@ done:
if (got_lock)
ocfs2_inode_unlock(inode, 1);
- if (inode)
- iput(inode);
-
+ iput(inode);
brelse(bh);
return status;
@@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
ocfs2_inode_unlock(inode, 1);
bail:
- if (inode)
- iput(inode);
+ iput(inode);
return status;
}
@@ -2021,6 +2017,7 @@ struct ocfs2_orphan_filldir_priv {
struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
};
static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
@@ -2036,12 +2033,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
if (name_len == 2 && !strncmp("..", name, 2))
return 0;
+ /* do not include dio entry in case of orphan scan */
+ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+ (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+ return 0;
+
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
return 0;
+ if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN))
+ OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
/* Skip inodes which are already added to recover list, since dio may
* happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
@@ -2060,14 +2067,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
int slot,
- struct inode **head)
+ struct inode **head,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int status;
struct inode *orphan_dir_inode = NULL;
struct ocfs2_orphan_filldir_priv priv = {
.ctx.actor = ocfs2_orphan_filldir,
.osb = osb,
- .head = *head
+ .head = *head,
+ .orphan_reco_type = orphan_reco_type
};
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -2170,7 +2179,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
- ret = ocfs2_queue_orphans(osb, slot, &inode);
+ ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
ocfs2_clear_recovering_orphan_dir(osb, slot);
/* Error here should be noted, but we want to continue with as
@@ -2186,25 +2195,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
- mutex_lock(&inode->i_mutex);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto next;
- }
- /*
- * We need to take and drop the inode lock to
- * force read inode from disk.
- */
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
- goto unlock_rw;
- }
+ if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+ mutex_lock(&inode->i_mutex);
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock_mutex;
+ }
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock_rw;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
- di = (struct ocfs2_dinode *)di_bh->b_data;
+ if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto unlock_inode;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode,
+ di_bh, 0, 0);
+ if (ret)
+ mlog_errno(ret);
+ }
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+ mutex_unlock(&inode->i_mutex);
- if (inode->i_nlink == 0) {
+ /* clear dio flag in ocfs2_inode_info */
+ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+ } else {
spin_lock(&oi->ip_lock);
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
@@ -2212,28 +2247,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
spin_unlock(&oi->ip_lock);
}
- if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
- (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size_read(inode));
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto unlock_inode;
- }
-
- ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
- if (ret)
- mlog_errno(ret);
- } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-unlock_inode:
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-unlock_rw:
- ocfs2_rw_unlock(inode, 1);
-next:
- mutex_unlock(&inode->i_mutex);
iput(inode);
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0711..e9c99e35f5ea 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
brelse(alloc_bh);
- if (inode)
- iput(inode);
+ iput(inode);
trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
@@ -473,8 +472,7 @@ out_mutex:
iput(main_bm_inode);
out:
- if (local_alloc_inode)
- iput(local_alloc_inode);
+ iput(local_alloc_inode);
kfree(alloc_copy);
}
@@ -1327,9 +1325,7 @@ bail:
brelse(main_bm_bh);
- if (main_bm_inode)
- iput(main_bm_inode);
-
+ iput(main_bm_inode);
kfree(alloc_copy);
if (ac)
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 6b6d092b0998..d56f0079b858 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -66,8 +66,11 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
* level.
*/
- flock_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ locks_lock_file_wait(file,
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
ocfs2_file_unlock(file);
}
@@ -81,7 +84,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
goto out;
}
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
if (ret)
ocfs2_file_unlock(file);
@@ -98,7 +101,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
mutex_lock(&fp->fp_mutex);
ocfs2_file_unlock(file);
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
mutex_unlock(&fp->fp_mutex);
return ret;
@@ -119,7 +122,7 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
- return flock_lock_file_wait(file, fl);
+ return locks_lock_file_wait(file, fl);
if (fl->fl_type == F_UNLCK)
return ocfs2_do_funlock(file, cmd, fl);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b7dfac226b1e..ab42c38031b1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
-#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
-#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
@@ -369,7 +367,7 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (status) {
mlog_errno(status);
goto leave;
@@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
return status;
}
- return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
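+ /* creation failed: return the claimed suballocator bit to its group */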
+ if (status < 0) {
+ u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
+ int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
+ inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
+ if (tmp)
+ mlog_errno(tmp);
+ }
+
+ return status;
}
static int ocfs2_mkdir(struct inode *dir,
@@ -1676,8 +1683,7 @@ bail:
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
- if (new_inode)
- iput(new_inode);
+ iput(new_inode);
ocfs2_free_dir_lookup_result(&target_lookup_res);
ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -1951,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0;
newsize = l - 1;
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
@@ -2365,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
name, strlen(name));
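+ /* reserve journal access to the orphan dir before touching any entry */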
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* find its spot in the orphan directory */
status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
@@ -2380,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(orphan_dir_inode),
- orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* do the i_nlink dance! :) */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e173329eb830..1155918d6784 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -26,6 +26,9 @@
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index b6d51333ad02..d153e6e31529 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -82,7 +82,7 @@ struct ocfs2_quota_chunk {
extern struct kmem_cache *ocfs2_dquot_cachep;
extern struct kmem_cache *ocfs2_qf_chunk_cachep;
-extern struct qtree_fmt_operations ocfs2_global_ops;
+extern const struct qtree_fmt_operations ocfs2_global_ops;
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb, int slot_num);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d67220887..fde9ef18cff3 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
dquot->dq_id);
}
-struct qtree_fmt_operations ocfs2_global_ops = {
+const struct qtree_fmt_operations ocfs2_global_ops = {
.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
.disk2mem_dqblk = ocfs2_global_disk2memdqb,
.is_id = ocfs2_global_is_id,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e5d57cd32505..252119860e6c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
- unsigned int from, to, readahead_pages;
+ unsigned int from, to;
loff_t offset, end, map_end;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
- readahead_pages =
- (ocfs2_cow_contig_clusters(sb) <<
- OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index d5da6f624142..79b8021302b3 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -54,11 +54,12 @@
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
u16 cl_cpg,
+ u16 old_bg_clusters,
int set)
{
int i;
u16 backups = 0;
- u32 cluster;
+ u32 cluster, lgd_cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
@@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
else if (gd_blkno > lgd_blkno)
break;
+ /* skip clusters in the old last group that already carry a backup super */
+ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
+ lgd_cluster += old_bg_clusters;
+ if (lgd_cluster >= cluster)
+ continue;
+
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
@@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+ u16 old_bg_clusters;
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
@@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
@@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 1);
+ cl_cpg, old_bg_clusters, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
@@ -163,7 +172,7 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 0);
+ cl_cpg, old_bg_clusters, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e78a203d44c8..1e09592148ad 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
if (si == NULL)
return;
- if (si->si_inode)
- iput(si->si_inode);
+ iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
@@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ /*
+ * If the block write failed, invalidate the slot so that we do
+ * not overwrite it during dismount, in case another node has
+ * legitimately mounted and taken it over in the meantime.
+ */
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, osb->slot_num);
+ osb->slot_num = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ }
bail:
return status;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d83d2602cf2b..fc6d25f6d444 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
if (!status) {
- hint = ocfs2_group_from_res(res);
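+ /* for the global bitmap the group block is recorded in sr_bg_blkno */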
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+ hint = res->sr_bg_blkno;
+ else
+ hint = ocfs2_group_from_res(res);
goto set_hint;
}
if (status < 0 && status != -ENOSPC) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..faa1365097bc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb,
int status, user_stack = 0;
char *p;
u32 tmp;
+ int token, option;
+ substring_t args[MAX_OPT_ARGS];
trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
@@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb,
}
while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
- substring_t args[MAX_OPT_ARGS];
-
if (!*p)
continue;
@@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->atime_quantum = option;
break;
case Opt_slot:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->slot = (s16)option;
break;
case Opt_commit:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
- if (inode)
- iput(inode);
+ iput(inode);
if (status)
mlog_errno(status);
@@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 66edce7ecfd7..6c2a3e3c521c 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = {
const struct inode_operations ocfs2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ebfdea78659b..f0e241ffd94f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index)
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
-
- return handler ? handler->prefix : NULL;
+ return handler ? xattr_prefix(handler) : NULL;
}
static u32 ocfs2_xattr_name_hash(struct inode *inode,
@@ -884,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_list_entry(char *buffer, size_t size,
- size_t *result, const char *prefix,
+static int ocfs2_xattr_list_entry(struct super_block *sb,
+ char *buffer, size_t size,
+ size_t *result, int type,
const char *name, int name_len)
{
char *p = buffer + *result;
- int prefix_len = strlen(prefix);
- int total_len = prefix_len + name_len + 1;
+ const char *prefix;
+ int prefix_len;
+ int total_len;
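+ /* skip xattr namespaces the caller is not permitted to list */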
+ switch (type) {
+ case OCFS2_XATTR_INDEX_USER:
+ if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
+ case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
+ if (!(sb->s_flags & MS_POSIXACL))
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+ break;
+ }
+
+ prefix = ocfs2_xattr_prefix(type);
+ if (!prefix)
+ return 0;
+ prefix_len = strlen(prefix);
+ total_len = prefix_len + name_len + 1;
*result += total_len;
/* we are just looking for how big our buffer needs to be */
@@ -914,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
{
size_t result = 0;
int i, type, ret;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
-
- if (prefix) {
- name = (const char *)header +
- le16_to_cpu(entry->xe_name_offset);
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
- ret = ocfs2_xattr_list_entry(buffer, buffer_size,
- &result, prefix, name,
- entry->xe_name_len);
- if (ret)
- return ret;
- }
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ buffer, buffer_size,
+ &result, type, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
}
return result;
@@ -4033,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
int ret = 0, type;
struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
int i, block_off, new_offset;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
- if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
- bucket_xh(bucket),
- i,
- &block_off,
- &new_offset);
- if (ret)
- break;
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(bucket),
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
- name = (const char *)bucket_block(bucket, block_off) +
- new_offset;
- ret = ocfs2_xattr_list_entry(xl->buffer,
- xl->buffer_size,
- &xl->result,
- prefix, name,
- entry->xe_name_len);
- if (ret)
- break;
- }
+ name = (const char *)bucket_block(bucket, block_off) +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ type, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
}
return ret;
@@ -7226,39 +7245,22 @@ int ocfs2_init_security_and_acl(struct inode *dir,
leave:
return ret;
}
+
/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
-static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -7311,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle,
const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
.set = ocfs2_xattr_security_set,
};
@@ -7319,46 +7320,24 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
-static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
.set = ocfs2_xattr_trusted_set,
};
@@ -7366,45 +7345,24 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
-static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
@@ -7414,7 +7372,6 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
.set = ocfs2_xattr_user_set,
};
diff --git a/fs/open.c b/fs/open.c
index b6f1e96a7c0b..b25b1542c530 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -887,7 +887,7 @@ EXPORT_SYMBOL(dentry_open);
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
- int acc_mode;
+ int acc_mode = ACC_MODE(flags);
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
@@ -909,7 +909,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if (flags & __O_TMPFILE) {
if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
- acc_mode = MAY_OPEN | ACC_MODE(flags);
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
} else if (flags & O_PATH) {
@@ -919,8 +918,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
*/
flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
acc_mode = 0;
- } else {
- acc_mode = MAY_OPEN | ACC_MODE(flags);
}
op->open_flag = flags;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500cda3e..b61b883c8ff8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 871fcb67be97..0a8983492d91 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -195,8 +195,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
- struct kstat *stat, struct iattr *attr,
- const char *link)
+ struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
@@ -240,8 +239,6 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
- if (!err && attr)
- err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
@@ -286,8 +283,7 @@ out_cleanup:
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat,
- struct iattr *attr)
+ struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
@@ -345,26 +341,19 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
- unlock_rename(workdir, upperdir);
+ /* Raced with another copy-up? Nothing to do, then... */
err = 0;
- /* Raced with another copy-up? Do the setattr here */
- if (attr) {
- mutex_lock(&upperdentry->d_inode->i_mutex);
- err = notify_change(upperdentry, attr, NULL);
- mutex_unlock(&upperdentry->d_inode->i_mutex);
- }
- goto out_put_cred;
+ goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
- stat, attr, link);
+ stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
-out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
@@ -406,7 +395,7 @@ int ovl_copy_up(struct dentry *dentry)
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
- err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
dput(parent);
dput(next);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index ec0c2a050043..964a60fa7afc 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -12,8 +12,7 @@
#include <linux/xattr.h>
#include "overlayfs.h"
-static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
- bool no_data)
+static int ovl_copy_up_truncate(struct dentry *dentry)
{
int err;
struct dentry *parent;
@@ -30,10 +29,8 @@ static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
if (err)
goto out_dput_parent;
- if (no_data)
- stat.size = 0;
-
- err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+ stat.size = 0;
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
out_dput_parent:
dput(parent);
@@ -49,13 +46,13 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
goto out;
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (!err) {
+ upperdentry = ovl_dentry_upper(dentry);
+
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
- } else {
- err = ovl_copy_up_last(dentry, attr, false);
}
ovl_drop_write(dentry);
out:
@@ -134,57 +131,23 @@ out_dput:
return err;
}
-
-struct ovl_link_data {
- struct dentry *realdentry;
- void *cookie;
-};
-
-static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
+static const char *ovl_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
- struct ovl_link_data *data = NULL;
- const char *ret;
+
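+ /* a NULL dentry means RCU walk; drop back to ref-walk with -ECHILD */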
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
- if (WARN_ON(!realinode->i_op->follow_link))
+ if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
- if (realinode->i_op->put_link) {
- data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
- if (!data)
- return ERR_PTR(-ENOMEM);
- data->realdentry = realdentry;
- }
-
- ret = realinode->i_op->follow_link(realdentry, cookie);
- if (IS_ERR_OR_NULL(ret)) {
- kfree(data);
- return ret;
- }
-
- if (data)
- data->cookie = *cookie;
-
- *cookie = data;
-
- return ret;
-}
-
-static void ovl_put_link(struct inode *unused, void *c)
-{
- struct inode *realinode;
- struct ovl_link_data *data = c;
-
- if (!data)
- return;
-
- realinode = data->realdentry->d_inode;
- realinode->i_op->put_link(realinode, data->cookie);
- kfree(data);
+ return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
@@ -353,7 +316,7 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
- err = ovl_copy_up_last(dentry, NULL, true);
+ err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
@@ -381,8 +344,7 @@ static const struct inode_operations ovl_file_inode_operations = {
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
- .follow_link = ovl_follow_link,
- .put_link = ovl_put_link,
+ .get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ea5a40b06e3a..e17154aeaae4 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -194,7 +194,6 @@ void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat,
- struct iattr *attr);
+ struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/pipe.c b/fs/pipe.c
index 8865f7963700..42cf8ddf0e55 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -366,18 +366,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- int error = ops->confirm(pipe, buf);
- if (error)
+ ret = ops->confirm(pipe, buf);
+ if (ret)
goto out;
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
- error = -EFAULT;
+ ret = -EFAULT;
goto out;
}
do_wakeup = 1;
- buf->len += chars;
- ret = chars;
+ buf->len += ret;
if (!iov_iter_count(from))
goto out;
}
@@ -693,17 +692,20 @@ int create_pipe_files(struct file **res, int flags)
d_instantiate(path.dentry, inode);
- err = -ENFILE;
f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
- if (IS_ERR(f))
+ if (IS_ERR(f)) {
+ err = PTR_ERR(f);
goto err_dentry;
+ }
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
f->private_data = inode->i_pipe;
res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
- if (IS_ERR(res[0]))
+ if (IS_ERR(res[0])) {
+ err = PTR_ERR(res[0]);
goto err_file;
+ }
path_get(&path);
res[0]->private_data = inode->i_pipe;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4fb17ded7d47..711dd5170376 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -762,8 +762,9 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
EXPORT_SYMBOL (posix_acl_to_xattr);
static int
-posix_acl_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
+posix_acl_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *value, size_t size)
{
struct posix_acl *acl;
int error;
@@ -773,7 +774,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
if (d_is_symlink(dentry))
return -EOPNOTSUPP;
- acl = get_acl(d_backing_inode(dentry), type);
+ acl = get_acl(d_backing_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -786,8 +787,9 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
}
static int
-posix_acl_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = d_backing_inode(dentry);
struct posix_acl *acl = NULL;
@@ -798,7 +800,7 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
return value ? -EACCES : 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -815,37 +817,20 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
}
}
- ret = inode->i_op->set_acl(inode, acl, type);
+ ret = inode->i_op->set_acl(inode, acl, handler->flags);
out:
posix_acl_release(acl);
return ret;
}
-static size_t
-posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+posix_acl_xattr_list(struct dentry *dentry)
{
- const char *xname;
- size_t size;
-
- if (!IS_POSIXACL(d_backing_inode(dentry)))
- return -EOPNOTSUPP;
- if (d_is_symlink(dentry))
- return -EOPNOTSUPP;
-
- if (type == ACL_TYPE_ACCESS)
- xname = POSIX_ACL_XATTR_ACCESS;
- else
- xname = POSIX_ACL_XATTR_DEFAULT;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return IS_POSIXACL(d_backing_inode(dentry));
}
const struct xattr_handler posix_acl_access_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
@@ -854,7 +839,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
const struct xattr_handler posix_acl_default_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f60f0121e331..d73291f5f0fc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,18 +91,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
char *buf;
+ size_t size;
char tcomm[sizeof(p->comm)];
+ int ret;
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- buf = m->buf + m->count;
- /* Ignore error for now */
- buf += string_escape_str(tcomm, buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ size = seq_get_buf(m, &buf);
+ ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ seq_commit(m, ret < size ? ret : -1);
- m->count = buf - m->buf;
seq_putc(m, '\n');
}
@@ -375,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
- unsigned long vsize, eip, esp, wchan = ~0UL;
+ unsigned long vsize, eip, esp, wchan = 0;
int priority, nice;
int tty_pgrp = -1, tty_nr = 0;
sigset_t sigign, sigcatch;
@@ -507,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', wchan);
+
+ /*
+ * We used to output the absolute kernel address, but that's an
+ * information leak - so instead we show a 0/1 flag here, to signal
+ * to user-space whether there's a wchan field in /proc/PID/wchan.
+ *
+ * This works with older implementations of procps as well.
+ */
+ if (wchan)
+ seq_puts(m, " 1");
+ else
+ seq_puts(m, " 0");
+
seq_put_decimal_ull(m, ' ', 0);
seq_put_decimal_ull(m, ' ', 0);
seq_put_decimal_ll(m, ' ', task->exit_signal);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b25eee4cead5..2cf5d7e37375 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (lookup_symbol_name(wchan, symname) < 0) {
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- return 0;
- seq_printf(m, "%lu", wchan);
- } else {
+ if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
- }
+ else
+ seq_putc(m, '0');
return 0;
}
@@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
+/*
+ * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+ * kernels. The effective policy is defined by oom_score_adj, which has a
+ * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+ * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+ * Processes that become oom disabled via oom_adj will still be oom disabled
+ * with this implementation.
+ *
+ * oom_adj cannot be removed since existing userspace binaries use it.
+ */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -1557,12 +1564,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
return -ENOENT;
}
-static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_pid_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct path path;
int error = -EACCES;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
/* Are we allowed to snoop on the tasks file descriptors? */
if (!proc_fd_access_allowed(inode))
goto out;
@@ -1623,7 +1634,7 @@ out:
const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
+ .get_link = proc_pid_get_link,
.setattr = proc_setattr,
};
@@ -1888,7 +1899,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = {
.d_delete = pid_delete_dentry,
};
-static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path)
{
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
@@ -1938,20 +1949,22 @@ struct map_files_info {
* path to the file in question.
*/
static const char *
-proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+proc_map_files_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- return proc_pid_follow_link(dentry, NULL);
+ return proc_pid_get_link(dentry, inode, done);
}
/*
- * Identical to proc_pid_link_inode_operations except for follow_link()
+ * Identical to proc_pid_link_inode_operations except for get_link()
*/
static const struct inode_operations proc_map_files_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_map_files_follow_link,
+ .get_link = proc_map_files_get_link,
.setattr = proc_setattr,
};
@@ -1968,7 +1981,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
return -ENOENT;
ei = PROC_I(inode);
- ei->op.proc_get_link = proc_map_files_get_link;
+ ei->op.proc_get_link = map_files_get_link;
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
@@ -2352,7 +2365,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page;
+ void *page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2367,14 +2380,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (*ppos != 0)
goto out;
- length = -ENOMEM;
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
+ page = memdup_user(buf, count);
+ if (IS_ERR(page)) {
+ length = PTR_ERR(page);
goto out;
-
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free;
+ }
/* Guard against adverse ptrace interaction */
length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
@@ -2383,10 +2393,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
length = security_setprocattr(task,
(char*)file->f_path.dentry->d_name.name,
- (void*)page, count);
+ page, count);
mutex_unlock(&task->signal->cred_guard_mutex);
out_free:
- free_page((unsigned long) page);
+ kfree(page);
out:
put_task_struct(task);
out_no_task:
@@ -2487,6 +2497,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
+ ret = 0;
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e5fcd00733e..56afa5ef08f2 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -258,6 +258,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
name, len, instantiate, p,
(void *)(unsigned long)fd))
goto out_fd_loop;
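+ /* yield the CPU between entries; fd tables can be very large */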
+ cond_resched();
rcu_read_lock();
}
rcu_read_unlock();
@@ -291,11 +292,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
*/
int proc_fd_permission(struct inode *inode, int mask)
{
- int rv = generic_permission(inode, mask);
+ struct task_struct *p;
+ int rv;
+
+ rv = generic_permission(inode, mask);
if (rv == 0)
- return 0;
- if (task_tgid(current) == proc_pid(inode))
+ return rv;
+
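+ /* allow any thread in the same thread group, not only the leader */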
+ rcu_read_lock();
+ p = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (p && same_thread_group(p, current))
rv = 0;
+ rcu_read_unlock();
+
return rv;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bd95b9fdebb0..42305ddcbaa0 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_PANIC),
init_once);
}
@@ -393,24 +394,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
-static const char *proc_follow_link(struct dentry *dentry, void **cookie)
+static void proc_put_link(void *p)
{
- struct proc_dir_entry *pde = PDE(d_inode(dentry));
- if (unlikely(!use_pde(pde)))
- return ERR_PTR(-EINVAL);
- *cookie = pde;
- return pde->data;
+ unuse_pde(p);
}
-static void proc_put_link(struct inode *unused, void *p)
+static const char *proc_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unuse_pde(p);
+ struct proc_dir_entry *pde = PDE(inode);
+ if (unlikely(!use_pde(pde)))
+ return ERR_PTR(-EINVAL);
+ set_delayed_call(done, proc_put_link, pde);
+ return pde->data;
}
const struct inode_operations proc_link_inode_operations = {
.readlink = generic_readlink,
- .follow_link = proc_follow_link,
- .put_link = proc_put_link,
+ .get_link = proc_get_link,
};
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d3ebf2e61853..df4661abadc4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
- struct vmalloc_info vmi;
long cached;
long available;
unsigned long pagecache;
@@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
if (cached < 0)
cached = 0;
- get_vmalloc_info(&vmi);
-
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
@@ -60,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
/*
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
- *
- * Free memory cannot be taken below the low watermark, before the
- * system starts swapping.
*/
- available = i.freeram - wmark_low;
+ available = i.freeram - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -191,8 +185,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
- vmi.used >> 10,
- vmi.largest_chunk >> 10
+ 0ul, // used to be vmalloc 'used'
+ 0ul // used to be vmalloc 'largest_chunk'
#ifdef CONFIG_MEMORY_FAILURE
, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index f6e8354b8cea..1dece8781f91 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,14 +30,18 @@ static const struct proc_ns_operations *ns_entries[] = {
&mntns_operations,
};
-static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_ns_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
task = get_proc_task(inode);
if (!task)
return error;
@@ -74,7 +78,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
static const struct inode_operations proc_ns_link_inode_operations = {
.readlink = proc_ns_readlink,
- .follow_link = proc_ns_follow_link,
+ .get_link = proc_ns_get_link,
.setattr = proc_setattr,
};
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..b2855eea5405 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
+ * simple test in page_mapcount() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (!PageSlab(page) && page_mapcount(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fdda62e6115e..fe5b6e6c4671 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -948,7 +948,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir,
found:
subdir->header.nreg++;
failed:
- if (unlikely(IS_ERR(subdir))) {
+ if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
pr_cont("/%*.*s %ld\n",
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 113b8d061fc0..67e8db442cf0 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -18,26 +18,28 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
if (!tgid)
return ERR_PTR(-ENOENT);
/* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
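+ /* a NULL dentry means RCU walk: sleeping is not allowed, use GFP_ATOMIC */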
+ name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d", tgid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_self_inode_operations = {
.readlink = proc_self_readlink,
- .follow_link = proc_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_self_get_link,
};
static unsigned self_inum;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e2d46adb54b4..65a1b6c69c11 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -22,9 +23,13 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap, ptes, pmds;
+ unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ anon = get_mm_counter(mm, MM_ANONPAGES);
+ file = get_mm_counter(mm, MM_FILEPAGES);
+ shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
@@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
- hiwater_rss = total_rss = get_mm_rss(mm);
+ hiwater_rss = total_rss = anon + file + shmem;
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
- data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
+ "RssAnon:\t%8lu kB\n"
+ "RssFile:\t%8lu kB\n"
+ "RssShmem:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
@@ -65,11 +72,15 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
- data << (PAGE_SHIFT-10),
+ anon << (PAGE_SHIFT-10),
+ file << (PAGE_SHIFT-10),
+ shmem << (PAGE_SHIFT-10),
+ mm->data_vm << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
ptes >> 10,
pmds >> 10,
swap << (PAGE_SHIFT-10));
+ hugetlb_report_usage(m, mm);
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -81,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES);
+ *shared = get_mm_counter(mm, MM_FILEPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
- *data = mm->total_vm - mm->shared_vm;
+ *data = mm->data_vm + mm->stack_vm;
*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -446,14 +458,18 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long shared_hugetlb;
+ unsigned long private_hugetlb;
u64 pss;
u64 swap_pss;
+ bool check_shmem_swap;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? HPAGE_PMD_NR : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
@@ -462,25 +478,52 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page were mapped with a PTE, it
+ * would elevate page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
}
+
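+ /* otherwise account each subpage separately by its own mapcount */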
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
+ }
+}
+
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+
+ mss->swap += shmem_partial_swap_usage(
+ walk->vma->vm_file->f_mapping, addr, end);
+
+ return 0;
}
+#endif
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
@@ -509,11 +552,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+ && pte_none(*pte))) {
+ page = find_get_entry(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
+ if (!page)
+ return;
+
+ if (radix_tree_exceptional_entry(page))
+ mss->swap += PAGE_SIZE;
+ else
+ page_cache_release(page);
+
+ return;
}
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -529,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -546,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
@@ -625,17 +681,74 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
seq_putc(m, '\n');
}
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page = NULL;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ }
+ if (page) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2)
+ mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
+ return 0;
+}
+#endif /* HUGETLB_PAGE */
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+ .hugetlb_entry = smaps_hugetlb_range,
+#endif
.mm = vma->vm_mm,
.private = &mss,
};
memset(&mss, 0, sizeof mss);
+
+#ifdef CONFIG_SHMEM
+ if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+ /*
+ * For shared or readonly shmem mappings we know that all
+ * swapped out pages belong to the shmem object, and we can
+ * obtain the swap value much more efficiently. For private
+ * writable mappings, we might have COW pages that are
+ * not affected by the parent swapped out pages of the shmem
+ * object, so we have to distinguish them during the page walk.
+ * Unless we know that the shmem object (or the part mapped by
+ * our VMA) has no swapped out pages at all.
+ */
+ unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+ if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE)) {
+ mss.swap = shmem_swapped;
+ } else {
+ mss.check_shmem_swap = true;
+ smaps_walk.pte_hole = smaps_pte_hole;
+ }
+ }
+#endif
+
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
@@ -652,6 +765,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %7lu kB\n"
"Swap: %8lu kB\n"
"SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
@@ -667,6 +782,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
+ mss.shared_hugetlb >> 10,
+ mss.private_hugetlb >> 10,
mss.swap >> 10,
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
@@ -753,36 +870,34 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
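+ /* use the modify_prot pair so the update does not race hardware A/D bit updates */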
+ ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
ptent = pte_wrprotect(ptent);
- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ ptent = pte_clear_soft_dirty(ptent);
+ ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
}
-
- set_pte_at(vma->vm_mm, addr, pte, ptent);
}
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+#endif
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = *pmdp;
+ pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
pmd = pmd_wrprotect(pmd);
- pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
-
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
+ pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
-
#else
-
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -798,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
@@ -1072,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
@@ -1404,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 947b0f4fd0a1..9eacd59e0360 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -19,26 +19,29 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_thread_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
if (!pid)
return ERR_PTR(-ENOENT);
- name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
+ dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d/task/%d", tgid, pid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_thread_self_inode_operations = {
.readlink = proc_thread_self_readlink,
- .follow_link = proc_thread_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_thread_self_get_link,
};
static unsigned thread_self_inum;
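This hunk is part of the tree-wide move from ->follow_link/->put_link to a single ->get_link carrying a struct delayed_call for cleanup. A minimal sketch of the new convention for a hypothetical filesystem (examplefs and its name format are assumptions): a NULL dentry signals RCU walk, where only GFP_ATOMIC allocation is safe and -ECHILD tells the VFS to retry in ref-walk mode, while set_delayed_call() defers freeing the returned string until the walk is done with it.

static const char *examplefs_get_link(struct dentry *dentry,
				      struct inode *inode,
				      struct delayed_call *done)
{
	/* may be called in RCU walk: dentry == NULL, no sleeping */
	char *name = kmalloc(64, dentry ? GFP_KERNEL : GFP_ATOMIC);

	if (unlikely(!name))
		return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
	snprintf(name, 64, "target-%lu", inode->i_ino);
	set_delayed_call(done, kfree_link, name);  /* freed later by VFS */
	return name;
}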
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8ebd9a334085..2256e7e23e67 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -95,9 +95,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
{
struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
- int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
+ int err;
if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt_path.dentry);
@@ -131,16 +131,17 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
struct mount *r = real_mount(mnt);
struct super_block *sb = mnt->mnt_sb;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
- int err = 0;
+ int err;
seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path)
+ if (sb->s_op->show_path) {
err = sb->s_op->show_path(m, mnt->mnt_root);
- else
+ if (err)
+ goto out;
+ } else {
seq_dentry(m, mnt->mnt_root, " \t\n\\");
- if (err)
- goto out;
+ }
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -168,12 +169,13 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, " - ");
show_type(m, sb);
seq_putc(m, ' ');
- if (sb->s_op->show_devname)
+ if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt->mnt_root);
- else
+ if (err)
+ goto out;
+ } else {
mangle(m, r->mnt_devname ? r->mnt_devname : "none");
- if (err)
- goto out;
+ }
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
err = show_sb_opts(m, sb);
if (err)
@@ -191,7 +193,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
struct mount *r = real_mount(mnt);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
- int err = 0;
+ int err;
/* device */
if (sb->s_op->show_devname) {
@@ -220,8 +222,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
/* optional statistics */
if (sb->s_op->show_stats) {
seq_putc(m, ' ');
- if (!err)
- err = sb->s_op->show_stats(m, mnt_path.dentry);
+ err = sb->s_op->show_stats(m, mnt_path.dentry);
}
seq_putc(m, '\n');
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 916b8e23d968..360ae43f590c 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,5 +1,5 @@
config PSTORE
- bool "Persistent store support"
+ tristate "Persistent store support"
default n
select ZLIB_DEFLATE
select ZLIB_INFLATE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index e647d8e81712..b8803cc07fce 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -2,12 +2,12 @@
# Makefile for the linux pstorefs routines.
#
-obj-y += pstore.o
+obj-$(CONFIG_PSTORE) += pstore.o
pstore-objs += inode.o platform.o
-obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
+pstore-$(CONFIG_PSTORE_FTRACE) += ftrace.o
-obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
+pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o
ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 76a4eeb92982..d4887705bb61 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -104,22 +104,23 @@ static const struct file_operations pstore_knob_fops = {
.write = pstore_ftrace_knob_write,
};
+static struct dentry *pstore_ftrace_dir;
+
void pstore_register_ftrace(void)
{
- struct dentry *dir;
struct dentry *file;
if (!psinfo->write_buf)
return;
- dir = debugfs_create_dir("pstore", NULL);
- if (!dir) {
+ pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
+ if (!pstore_ftrace_dir) {
pr_err("%s: unable to create pstore directory\n", __func__);
return;
}
- file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
- &pstore_knob_fops);
+ file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir,
+ NULL, &pstore_knob_fops);
if (!file) {
pr_err("%s: unable to create record_ftrace file\n", __func__);
goto err_file;
@@ -127,5 +128,17 @@ void pstore_register_ftrace(void)
return;
err_file:
- debugfs_remove(dir);
+ debugfs_remove(pstore_ftrace_dir);
+}
+
+void pstore_unregister_ftrace(void)
+{
+ mutex_lock(&pstore_ftrace_lock);
+ if (pstore_ftrace_enabled) {
+ unregister_ftrace_function(&pstore_ftrace_ops);
+ pstore_ftrace_enabled = 0;
+ }
+ mutex_unlock(&pstore_ftrace_lock);
+
+ debugfs_remove_recursive(pstore_ftrace_dir);
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 3adcc4669fac..d8c439d813ce 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,7 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
}
static const struct file_operations pstore_file_operations = {
+ .owner = THIS_MODULE,
.open = pstore_file_open,
.read = pstore_file_read,
.llseek = pstore_file_llseek,
@@ -287,7 +288,7 @@ static const struct super_operations pstore_ops = {
static struct super_block *pstore_sb;
-int pstore_is_mounted(void)
+bool pstore_is_mounted(void)
{
return pstore_sb != NULL;
}
@@ -456,6 +457,7 @@ static void pstore_kill_sb(struct super_block *sb)
}
static struct file_system_type pstore_fs_type = {
+ .owner = THIS_MODULE,
.name = "pstore",
.mount = pstore_mount,
.kill_sb = pstore_kill_sb,
@@ -479,5 +481,12 @@ out:
}
module_init(init_pstore_fs)
+static void __exit exit_pstore_fs(void)
+{
+ unregister_filesystem(&pstore_fs_type);
+ sysfs_remove_mount_point(fs_kobj, "pstore");
+}
+module_exit(exit_pstore_fs)
+
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index c36ba2cd0b5d..e38a22b31282 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -41,14 +41,18 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
+extern void pstore_unregister_ftrace(void);
#else
static inline void pstore_register_ftrace(void) {}
+static inline void pstore_unregister_ftrace(void) {}
#endif
#ifdef CONFIG_PSTORE_PMSG
extern void pstore_register_pmsg(void);
+extern void pstore_unregister_pmsg(void);
#else
static inline void pstore_register_pmsg(void) {}
+static inline void pstore_unregister_pmsg(void) {}
#endif
extern struct pstore_info *psinfo;
@@ -59,6 +63,6 @@ extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
int count, char *data, bool compressed,
size_t size, struct timespec time,
struct pstore_info *psi);
-extern int pstore_is_mounted(void);
+extern bool pstore_is_mounted(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 791743deedf1..588461bb2dd4 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -237,6 +237,14 @@ static void allocate_buf_for_compression(void)
}
+static void free_buf_for_compression(void)
+{
+ kfree(stream.workspace);
+ stream.workspace = NULL;
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+}
+
/*
* Called when compression fails, since the printk buffer
* would be fetched for compression calling it again when
@@ -353,6 +361,19 @@ static struct kmsg_dumper pstore_dumper = {
.dump = pstore_dump,
};
+/*
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
+static void pstore_register_kmsg(void)
+{
+ kmsg_dump_register(&pstore_dumper);
+}
+
+static void pstore_unregister_kmsg(void)
+{
+ kmsg_dump_unregister(&pstore_dumper);
+}
+
#ifdef CONFIG_PSTORE_CONSOLE
static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
@@ -390,8 +411,14 @@ static void pstore_register_console(void)
{
register_console(&pstore_console);
}
+
+static void pstore_unregister_console(void)
+{
+ unregister_console(&pstore_console);
+}
#else
static void pstore_register_console(void) {}
+static void pstore_unregister_console(void) {}
#endif
static int pstore_write_compat(enum pstore_type_id type,
@@ -410,8 +437,6 @@ static int pstore_write_compat(enum pstore_type_id type,
* read function right away to populate the file system. If not
* then the pstore mount code will call us later to fill out
* the file system.
- *
- * Register with kmsg_dump to save last part of console log on panic.
*/
int pstore_register(struct pstore_info *psi)
{
@@ -442,7 +467,7 @@ int pstore_register(struct pstore_info *psi)
if (pstore_is_mounted())
pstore_get_records(0);
- kmsg_dump_register(&pstore_dumper);
+ pstore_register_kmsg();
if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
pstore_register_console();
@@ -462,12 +487,28 @@ int pstore_register(struct pstore_info *psi)
*/
backend = psi->name;
+ module_put(owner);
+
pr_info("Registered %s as persistent store backend\n", psi->name);
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
+void pstore_unregister(struct pstore_info *psi)
+{
+ pstore_unregister_pmsg();
+ pstore_unregister_ftrace();
+ pstore_unregister_console();
+ pstore_unregister_kmsg();
+
+ free_buf_for_compression();
+
+ psinfo = NULL;
+ backend = NULL;
+}
+EXPORT_SYMBOL_GPL(pstore_unregister);
+
/*
* Read all the records from the persistent store. Create
* files in our filesystem. Don't warn about -EEXIST errors
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index feb5dd2948b4..7de20cd3797f 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -37,6 +37,8 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
buffer = vmalloc(buffer_size);
+ if (!buffer)
+ return -ENOMEM;
mutex_lock(&pmsg_lock);
for (i = 0; i < count; ) {
@@ -112,3 +114,10 @@ err_class:
err:
return;
}
+
+void pstore_unregister_pmsg(void)
+{
+ device_destroy(pmsg_class, MKDEV(pmsg_major, 0));
+ class_destroy(pmsg_class);
+ unregister_chrdev(pmsg_major, PMSG_NAME);
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 6c26c4daaec9..319c3a60cfa5 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -578,30 +578,27 @@ fail_out:
return err;
}
-static int __exit ramoops_remove(struct platform_device *pdev)
+static int ramoops_remove(struct platform_device *pdev)
{
-#if 0
- /* TODO(kees): We cannot unload ramoops since pstore doesn't support
- * unregistering yet.
- */
struct ramoops_context *cxt = &oops_cxt;
- iounmap(cxt->virt_addr);
- release_mem_region(cxt->phys_addr, cxt->size);
+ pstore_unregister(&cxt->pstore);
cxt->max_dump_cnt = 0;
- /* TODO(kees): When pstore supports unregistering, call it here. */
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
+ persistent_ram_free(cxt->mprz);
+ persistent_ram_free(cxt->fprz);
+ persistent_ram_free(cxt->cprz);
+ ramoops_free_przs(cxt);
+
return 0;
-#endif
- return -EBUSY;
}
static struct platform_driver ramoops_driver = {
.probe = ramoops_probe,
- .remove = __exit_p(ramoops_remove),
+ .remove = ramoops_remove,
.driver = {
.name = "ramoops",
},
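With pstore now tristate and pstore_unregister() exported, a backend can finally be paired with a clean removal path, which is what lets ramoops_remove() above do real work instead of returning -EBUSY. A hedged sketch of that lifecycle for a hypothetical module backend (the "example" name is an assumption, and the callbacks a real backend must supply are elided):

static struct pstore_info example_psinfo = {
	.owner	= THIS_MODULE,
	.name	= "example",
	/* .open, .read, .write, .close plus .buf/.bufsize elided;
	 * a real backend must set them before registering */
};

static int __init example_pstore_init(void)
{
	return pstore_register(&example_psinfo);
}

static void __exit example_pstore_exit(void)
{
	/* tears down the kmsg/console/ftrace/pmsg hooks, as added above */
	pstore_unregister(&example_psinfo);
}

module_init(example_pstore_init);
module_exit(example_pstore_exit);
MODULE_LICENSE("GPL");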
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index c4bcb778886e..3a67cfb142d8 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &qnx4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx4_aops;
qnx4_i(inode)->mmu_private = inode->i_size;
} else {
@@ -364,7 +365,7 @@ static int init_inodecache(void)
qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
sizeof(struct qnx4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (qnx4_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 32d2e1a9774c..47bb1de07155 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mapping->a_ops = &qnx6_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx6_aops;
} else
init_special_inode(inode, inode->i_mode, 0);
@@ -624,7 +625,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ef0d64b2a6d9..fbd70af98820 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2924,4 +2924,4 @@ static int __init dquot_init(void)
return 0;
}
-module_init(dquot_init);
+fs_initcall(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index bb2869f5dfd8..d07a2f91d858 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -1,7 +1,5 @@
-
#include <linux/cred.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/quotaops.h>
#include <linux/sched.h>
@@ -105,5 +103,4 @@ static int __init quota_init(void)
"VFS: Failed to create quota netlink interface.\n");
return 0;
};
-
-module_init(quota_init);
+fs_initcall(quota_init);
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 2aa012a68e90..ed85d4f35c04 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
static int v2r1_is_id(void *dp, struct dquot *dquot);
-static struct qtree_fmt_operations v2r0_qtree_ops = {
+static const struct qtree_fmt_operations v2r0_qtree_ops = {
.mem2disk_dqblk = v2r0_mem2diskdqb,
.disk2mem_dqblk = v2r0_disk2memdqb,
.is_id = v2r0_is_id,
};
-static struct qtree_fmt_operations v2r1_qtree_ops = {
+static const struct qtree_fmt_operations v2r1_qtree_ops = {
.mem2disk_dqblk = v2r1_mem2diskdqb,
.disk2mem_dqblk = v2r1_disk2memdqb,
.is_id = v2r1_is_id,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 889d558b4e05..38981b037524 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3faf1bb..06b07d5a08fe 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -171,6 +172,45 @@ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t si
EXPORT_SYMBOL(fixed_size_llseek);
/**
+ * no_seek_end_llseek - llseek implementation for fixed-sized devices
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ *
+ * Only SEEK_SET and SEEK_CUR are supported; anything else, notably
+ * SEEK_END, is rejected with -EINVAL.
+ */
+loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR:
+ return generic_file_llseek_size(file, offset, whence,
+ ~0ULL, 0);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(no_seek_end_llseek);
+
+/**
+ * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @size: maximal offset allowed
+ *
+ * Like no_seek_end_llseek(), but offsets are capped at @size rather
+ * than the full ~0ULL range.
+ */
+loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR:
+ return generic_file_llseek_size(file, offset, whence,
+ size, 0);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(no_seek_end_llseek_size);
+
+/**
* noop_llseek - No Operation Performed llseek implementation
* @file: file structure to seek on
* @offset: file offset to seek to
@@ -395,9 +435,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
}
if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- retval = locks_mandatory_area(
- read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
- inode, file, pos, count);
+ retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
+ read_write == READ ? F_RDLCK : F_WRLCK);
if (retval < 0)
return retval;
}
@@ -1327,3 +1366,299 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows returning partial success. Whether and when it
+ * does so is up to the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ ssize_t ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+
+ if (len == 0)
+ return 0;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = -EOPNOTSUPP;
+ if (file_out->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+ pos_out, len, flags);
+ if (ret == -EOPNOTSUPP)
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret = -EBADF;
+
+ f_in = fdget(fd_in);
+ if (!f_in.file)
+ goto out2;
+
+ f_out = fdget(fd_out);
+ if (!f_out.file)
+ goto out1;
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+
+out:
+ fdput(f_out);
+out1:
+ fdput(f_in);
+out2:
+ return ret;
+}
+
+static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+{
+ struct inode *inode = file_inode(file);
+
+ if (unlikely(pos < 0))
+ return -EINVAL;
+
+ if (unlikely((loff_t) (pos + len) < 0))
+ return -EINVAL;
+
+ if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
+ loff_t end = len ? pos + len - 1 : OFFSET_MAX;
+ int retval;
+
+ retval = locks_mandatory_area(inode, file, pos, end,
+ write ? F_WRLCK : F_RDLCK);
+ if (retval < 0)
+ return retval;
+ }
+
+ return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+}
+
+int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ int ret;
+
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op->clone_file_range)
+ return -EBADF;
+
+ ret = clone_verify_area(file_in, pos_in, len, false);
+ if (ret)
+ return ret;
+
+ ret = clone_verify_area(file_out, pos_out, len, true);
+ if (ret)
+ return ret;
+
+ if (pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->clone_file_range(file_in, pos_in,
+ file_out, pos_out, len);
+ if (!ret) {
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
+ }
+
+ mnt_drop_write_file(file_out);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_clone_file_range);
+
+int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+{
+ struct file_dedupe_range_info *info;
+ struct inode *src = file_inode(file);
+ u64 off;
+ u64 len;
+ int i;
+ int ret;
+ bool is_admin = capable(CAP_SYS_ADMIN);
+ u16 count = same->dest_count;
+ struct file *dst_file;
+ loff_t dst_off;
+ ssize_t deduped;
+
+ if (!(file->f_mode & FMODE_READ))
+ return -EINVAL;
+
+ if (same->reserved1 || same->reserved2)
+ return -EINVAL;
+
+ off = same->src_offset;
+ len = same->src_length;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode))
+ goto out;
+
+ ret = -EINVAL;
+ if (!S_ISREG(src->i_mode))
+ goto out;
+
+ ret = clone_verify_area(file, off, len, false);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+ /* pre-format output fields to sane values */
+ for (i = 0; i < count; i++) {
+ same->info[i].bytes_deduped = 0ULL;
+ same->info[i].status = FILE_DEDUPE_RANGE_SAME;
+ }
+
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct inode *dst;
+ struct fd dst_fd = fdget(info->dest_fd);
+
+ dst_file = dst_fd.file;
+ if (!dst_file) {
+ info->status = -EBADF;
+ goto next_loop;
+ }
+ dst = file_inode(dst_file);
+
+ ret = mnt_want_write_file(dst_file);
+ if (ret) {
+ info->status = ret;
+ goto next_loop;
+ }
+
+ dst_off = info->dest_offset;
+ ret = clone_verify_area(dst_file, dst_off, len, true);
+ if (ret < 0) {
+ info->status = ret;
+ goto next_file;
+ }
+ ret = 0;
+
+ if (info->reserved) {
+ info->status = -EINVAL;
+ } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+ info->status = -EINVAL;
+ } else if (file->f_path.mnt != dst_file->f_path.mnt) {
+ info->status = -EXDEV;
+ } else if (S_ISDIR(dst->i_mode)) {
+ info->status = -EISDIR;
+ } else if (dst_file->f_op->dedupe_file_range == NULL) {
+ info->status = -EINVAL;
+ } else {
+ deduped = dst_file->f_op->dedupe_file_range(file, off,
+ len, dst_file,
+ info->dest_offset);
+ if (deduped == -EBADE)
+ info->status = FILE_DEDUPE_RANGE_DIFFERS;
+ else if (deduped < 0)
+ info->status = deduped;
+ else
+ info->bytes_deduped += deduped;
+ }
+
+next_file:
+ mnt_drop_write_file(dst_file);
+next_loop:
+ fdput(dst_fd);
+ }
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range);
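Until the C library grows a wrapper, the new syscall is reachable from userspace via syscall(2). A sketch under that assumption (the copy_range() helper is illustrative, and __NR_copy_file_range must be defined by the installed headers); because partial success is explicitly allowed, callers should loop:

#include <unistd.h>
#include <sys/syscall.h>

/* Copy len bytes between the files' current offsets, retrying on
 * short copies. Returns 0 on success, -1 with errno set on error. */
static int copy_range(int fd_in, int fd_out, size_t len)
{
	while (len > 0) {
		ssize_t ret = syscall(__NR_copy_file_range, fd_in, NULL,
				      fd_out, NULL, len, 0);
		if (ret < 0)
			return -1;
		if (ret == 0)
			break;		/* source exhausted */
		len -= ret;
	}
	return 0;
}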
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 3d8e7e671d5b..ae9e5b308cf9 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_fop = &reiserfs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
} else {
inode->i_blocks = 0;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9d6486d416a3..44c2bdced1c8 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -618,12 +618,10 @@ static void release_buffer_page(struct buffer_head *bh)
static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (buffer_journaled(bh)) {
reiserfs_warning(NULL, "clm-2084",
- "pinned buffer %lu:%s sent to disk",
- bh->b_blocknr, bdevname(bh->b_bdev, b));
+ "pinned buffer %lu:%pg sent to disk",
+ bh->b_blocknr, bh->b_bdev);
}
if (uptodate)
set_buffer_uptodate(bh);
@@ -2387,11 +2385,10 @@ static int journal_read(struct super_block *sb)
int replay_count = 0;
int continue_replay = 1;
int ret;
- char b[BDEVNAME_SIZE];
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- reiserfs_info(sb, "checking transaction log (%s)\n",
- bdevname(journal->j_dev_bd, b));
+ reiserfs_info(sb, "checking transaction log (%pg)\n",
+ journal->j_dev_bd);
start = get_seconds();
/*
@@ -2651,8 +2648,8 @@ static int journal_init_dev(struct super_block *super,
set_blocksize(journal->j_dev_bd, super->s_blocksize);
reiserfs_info(super,
- "journal_init_dev: journal device: %s\n",
- bdevname(journal->j_dev_bd, b));
+ "journal_init_dev: journal device: %pg\n",
+ journal->j_dev_bd);
return 0;
}
@@ -2724,7 +2721,6 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
struct reiserfs_journal_header *jh;
struct reiserfs_journal *journal;
struct reiserfs_journal_list *jl;
- char b[BDEVNAME_SIZE];
int ret;
journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
@@ -2794,10 +2790,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
&& (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
sb_jp_journal_magic(rs))) {
reiserfs_warning(sb, "sh-460",
- "journal header magic %x (device %s) does "
+ "journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- bdevname(journal->j_dev_bd, b),
+ journal->j_dev_bd,
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2818,10 +2814,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_max_trans_age = commit_max_age;
}
- reiserfs_info(sb, "journal params: device %s, size %u, "
+ reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- bdevname(journal->j_dev_bd, b),
+ journal->j_dev_bd,
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 5f1c9c29eb8c..2a12d46d7fb4 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -712,9 +712,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
retval = dquot_initialize(dir);
if (retval)
return retval;
@@ -1173,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
reiserfs_update_inode_transaction(parent_dir);
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
@@ -1667,8 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
*/
const struct inode_operations reiserfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index ae1dc841db3a..4f3f928076f3 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -139,11 +139,9 @@ static void sprintf_block_head(char *buf, struct buffer_head *bh)
static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
-
sprintf(buf,
- "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
- bdevname(bh->b_bdev, b), bh->b_size,
+ "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+ bh->b_bdev, bh->b_size,
(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
bh->b_state, bh->b_page,
buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
@@ -530,7 +528,6 @@ static int print_super_block(struct buffer_head *bh)
(struct reiserfs_super_block *)(bh->b_data);
int skipped, data_blocks;
char *version;
- char b[BDEVNAME_SIZE];
if (is_reiserfs_3_5(rs)) {
version = "3.5";
@@ -543,7 +540,7 @@ static int print_super_block(struct buffer_head *bh)
return 1;
}
- printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b),
+ printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
(unsigned long long)bh->b_blocknr);
printk("Reiserfs version %s\n", version);
printk("Block count %u\n", sb_block_count(rs));
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 621b9f381fe1..fe999157dd97 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -303,11 +303,10 @@ static int show_journal(struct seq_file *m, void *unused)
struct reiserfs_sb_info *r = REISERFS_SB(sb);
struct reiserfs_super_block *rs = r->s_rs;
struct journal_params *jp = &rs->s_v1.s_journal;
- char b[BDEVNAME_SIZE];
seq_printf(m, /* on-disk fields */
"jp_journal_1st_block: \t%i\n"
- "jp_journal_dev: \t%s[%x]\n"
+ "jp_journal_dev: \t%pg[%x]\n"
"jp_journal_size: \t%i\n"
"jp_journal_trans_max: \t%i\n"
"jp_journal_magic: \t%i\n"
@@ -348,7 +347,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+ SB_JOURNAL(sb)->j_dev_bd,
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4a62fe8cc3bf..05db7473bcb5 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e87f9b52bf06..e5ddb4e5ea94 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers,
return NULL;
for_each_xattr_handler(handlers, xah) {
- if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+ const char *prefix = xattr_prefix(xah);
+ if (strncmp(prefix, name, strlen(prefix)) == 0)
break;
}
@@ -778,7 +779,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -797,7 +798,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -814,7 +815,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
struct listxattr_buf {
@@ -839,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
name);
- if (!handler) /* Unsupported xattr name */
+ if (!handler /* Unsupported xattr name */ ||
+ (handler->list && !handler->list(b->dentry)))
return 0;
+ size = namelen + 1;
if (b->buf) {
- size = handler->list(b->dentry, b->buf + b->pos,
- b->size, name, namelen,
- handler->flags);
if (size > b->size)
return -ERANGE;
- } else {
- size = handler->list(b->dentry, NULL, 0, name,
- namelen, handler->flags);
+ memcpy(b->buf + b->pos, name, namelen);
+ b->buf[b->pos + namelen] = 0;
}
-
b->pos += size;
}
return 0;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 4b34b9dc03dd..558a16beaacb 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
@@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 9a3b0616f283..ab0217d32039 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,8 +9,8 @@
#include <linux/uaccess.h>
static int
-security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+security_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -22,8 +22,8 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-security_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+security_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -34,20 +34,9 @@ security_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t namelen, int handler_flags)
+static bool security_list(struct dentry *dentry)
{
- const size_t len = namelen + 1;
-
- if (IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_len) {
- memcpy(list, name, namelen);
- list[namelen] = '\0';
- }
-
- return len;
+ return !IS_PRIVATE(d_inode(dentry));
}
/* Initializes the security context for a new inode and returns the number
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index e4f1343714e0..64b67aa643a9 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,8 +8,8 @@
#include <linux/uaccess.h>
static int
-trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -21,8 +21,8 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-trusted_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -33,19 +33,9 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static bool trusted_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
}
const struct xattr_handler reiserfs_xattr_trusted_handler = {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index d0b08d3e5689..12e6306f562a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,8 +7,8 @@
#include <linux/uaccess.h>
static int
-user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+user_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
@@ -19,8 +19,8 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-user_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+user_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
return -EINVAL;
@@ -30,18 +30,9 @@ user_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static bool user_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!reiserfs_xattrs_user(dentry->d_sb))
- return 0;
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return reiserfs_xattrs_user(dentry->d_sb);
}
const struct xattr_handler reiserfs_xattr_user_handler = {
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268733cda397..6b00ca357c58 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
break;
case ROMFH_SYM:
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &romfs_aops;
mode |= S_IRWXUGO;
break;
@@ -618,8 +619,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 015547330e88..79d0d4953cad 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -778,8 +778,8 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
return mask;
}
-static int do_poll(unsigned int nfds, struct poll_list *list,
- struct poll_wqueues *wait, struct timespec *end_time)
+static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
+ struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
@@ -908,7 +908,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
}
poll_initwait(&table);
- fdcount = do_poll(nfds, head, &table, end_time);
+ fdcount = do_poll(head, &table, end_time);
poll_freewait(&table);
for (walk = head; walk; walk = walk->next) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 225586e141ca..e85664b7c7d9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
+#include <linux/string_helpers.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -25,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
void *buf;
+ gfp_t gfp = GFP_KERNEL;
/*
- * __GFP_NORETRY to avoid oom-killings with high-order allocations -
- * it's better to fall back to vmalloc() than to kill things.
+	 * For high-order allocations, use __GFP_NORETRY to avoid the OOM
+	 * killer - it's better to fall back to vmalloc() than to kill things.
+	 * For small allocations, plain GFP_KERNEL is fine (it may invoke the
+	 * OOM killer), so no vmalloc fallback is needed.
*/
- buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ if (size > PAGE_SIZE)
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ buf = kmalloc(size, gfp);
if (!buf && size > PAGE_SIZE)
buf = vmalloc(size);
return buf;
@@ -377,26 +383,12 @@ EXPORT_SYMBOL(seq_release);
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *end = m->buf + m->size;
- char *p;
- char c;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
- if (!strchr(esc, c)) {
- *p++ = c;
- continue;
- }
- if (p + 3 < end) {
- *p++ = '\\';
- *p++ = '0' + ((c & 0300) >> 6);
- *p++ = '0' + ((c & 070) >> 3);
- *p++ = '0' + (c & 07);
- continue;
- }
- seq_set_overflow(m);
- return;
- }
- m->count = p - m->buf;
+ ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+ seq_commit(m, ret < size ? ret : -1);
}
EXPORT_SYMBOL(seq_escape);
@@ -773,6 +765,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
{
const u8 *ptr = buf;
int i, linelen, remaining = len;
+ char *buffer;
+ size_t size;
int ret;
if (rowsize != 16 && rowsize != 32)
@@ -794,15 +788,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
break;
}
+ size = seq_get_buf(m, &buffer);
ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
- m->buf + m->count, m->size - m->count,
- ascii);
- if (ret >= m->size - m->count) {
- seq_set_overflow(m);
- } else {
- m->count += ret;
- seq_putc(m, '\n');
- }
+ buffer, size, ascii);
+ seq_commit(m, ret < size ? ret : -1);
+
+ seq_putc(m, '\n');
}
}
EXPORT_SYMBOL(seq_hex_dump);
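Both seq_file hunks above replace direct pokes at m->buf/m->count with the seq_get_buf()/seq_commit() pairing. A sketch of the pattern (example_show() and its arguments are assumptions): borrow whatever space remains in the buffer, format into it, then commit the bytes consumed, or -1 to flag overflow so seq_file retries with a larger buffer.

static void example_show(struct seq_file *m, const u8 *data, int len)
{
	char *p;
	size_t avail = seq_get_buf(m, &p);	/* space left in m->buf */
	int used = hex_dump_to_buffer(data, len, 16, 1, p, avail, true);

	/* commit what fit; -1 marks overflow and forces a retry */
	seq_commit(m, used < avail ? used : -1);
	seq_putc(m, '\n');
}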
diff --git a/fs/splice.c b/fs/splice.c
index 5fc1e50a7f30..82bc0d64fc38 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
@@ -415,6 +415,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
*/
if (!page->mapping) {
unlock_page(page);
+retry_lookup:
page = find_or_create_page(mapping, index,
mapping_gfp_mask(mapping));
@@ -439,13 +440,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
error = mapping->a_ops->readpage(in, page);
if (unlikely(error)) {
/*
- * We really should re-lookup the page here,
- * but it complicates things a lot. Instead
- * lets just do what we already stored, and
- * we'll get it the next time we are called.
+ * Re-lookup the page
*/
if (error == AOP_TRUNCATED_PAGE)
- error = 0;
+ goto retry_lookup;
break;
}
@@ -809,6 +807,13 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
*/
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
+ /*
+ * Check for signal early to make process killable when there are
+ * always buffers available
+ */
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
while (!pipe->nrbufs) {
if (!pipe->writers)
return 0;
@@ -884,6 +889,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_from_pipe_begin(sd);
do {
+ cond_resched();
ret = splice_from_pipe_next(pipe, sd);
if (ret > 0)
ret = splice_from_pipe_feed(pipe, sd, actor);
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index a1ce5ce60632..0927b1e80ab6 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -41,6 +41,7 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/xattr.h>
+#include <linux/pagemap.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
inode->i_op = &squashfs_symlink_inode_ops;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &squashfs_symlink_aops;
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5056babe00df..5e79bfa4f260 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -80,7 +80,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct squashfs_sb_info *msblk;
struct squashfs_super_block *sblk = NULL;
- char b[BDEVNAME_SIZE];
struct inode *root;
long long root_inode;
unsigned short flags;
@@ -124,8 +123,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = le32_to_cpu(sblk->s_magic);
if (sb->s_magic != SQUASHFS_MAGIC) {
if (!silent)
- ERROR("Can't find a SQUASHFS superblock on %s\n",
- bdevname(sb->s_bdev, b));
+ ERROR("Can't find a SQUASHFS superblock on %pg\n",
+ sb->s_bdev);
goto failed_mount;
}
@@ -178,7 +177,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
msblk->inodes = le32_to_cpu(sblk->inodes);
flags = le16_to_cpu(sblk->flags);
- TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+ TRACE("Found valid superblock on %pg\n", sb->s_bdev);
TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
? "un" : "");
TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
@@ -420,7 +419,8 @@ static int __init init_inodecache(void)
{
squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
sizeof(struct squashfs_inode_info), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
return squashfs_inode_cachep ? 0 : -ENOMEM;
}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 12806dffb345..dbcc2f54bad4 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -119,8 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = {
const struct inode_operations squashfs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e5e0ddf5b143..1e9de96288d8 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
struct squashfs_xattr_entry entry;
struct squashfs_xattr_val val;
const struct xattr_handler *handler;
- int name_size, prefix_size = 0;
+ int name_size;
err = squashfs_read_metadata(sb, &entry, &start, &offset,
sizeof(entry));
@@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
name_size = le16_to_cpu(entry.size);
handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
- if (handler)
- prefix_size = handler->list(d, buffer, rest, NULL,
- name_size, handler->flags);
- if (prefix_size) {
+ if (handler && (!handler->list || handler->list(d))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_size = strlen(prefix);
+
if (buffer) {
if (prefix_size + name_size + 1 > rest) {
err = -ERANGE;
goto failed;
}
+ memcpy(buffer, prefix, prefix_size);
buffer += prefix_size;
}
err = squashfs_read_metadata(sb, buffer, &start,
@@ -212,88 +213,45 @@ failed:
}
-/*
- * User namespace support
- */
-static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- if (list && XATTR_USER_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
- return XATTR_USER_PREFIX_LEN;
-}
-
-static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
- size_t size, int type)
+static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *d, const char *name,
+ void *buffer, size_t size)
{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
+ return squashfs_xattr_get(d_inode(d), handler->flags, name,
buffer, size);
}
+/*
+ * User namespace support
+ */
static const struct xattr_handler squashfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = squashfs_user_list,
- .get = squashfs_user_get
+ .flags = SQUASHFS_XATTR_USER,
+ .get = squashfs_xattr_handler_get
};
/*
* Trusted namespace support
*/
-static size_t squashfs_trusted_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static bool squashfs_trusted_xattr_handler_list(struct dentry *d)
{
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
- return XATTR_TRUSTED_PREFIX_LEN;
-}
-
-static int squashfs_trusted_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
- buffer, size);
+ return capable(CAP_SYS_ADMIN);
}
static const struct xattr_handler squashfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = squashfs_trusted_list,
- .get = squashfs_trusted_get
+ .flags = SQUASHFS_XATTR_TRUSTED,
+ .list = squashfs_trusted_xattr_handler_list,
+ .get = squashfs_xattr_handler_get
};
/*
* Security namespace support
*/
-static size_t squashfs_security_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
- return XATTR_SECURITY_PREFIX_LEN;
-}
-
-static int squashfs_security_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
- buffer, size);
-}
-
static const struct xattr_handler squashfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = squashfs_security_list,
- .get = squashfs_security_get
+ .flags = SQUASHFS_XATTR_SECURITY,
+ .get = squashfs_xattr_handler_get
};
static const struct xattr_handler *squashfs_xattr_handler(int type)
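The squashfs conversion above shows the reworked xattr_handler contract in full: ->get now receives the handler itself, so one callback can serve several namespaces keyed off handler->flags, and ->list shrinks to a visibility predicate because the VFS copies out the name. A sketch of a handler under the new API (the example_* names and flag value are assumptions):

static int example_xattr_get(const struct xattr_handler *handler,
			     struct dentry *dentry, const char *name,
			     void *buffer, size_t size)
{
	/* handler->flags picks the namespace; name is the suffix */
	return example_xattr_fetch(d_inode(dentry), handler->flags,
				   name, buffer, size);
}

static bool example_xattr_list(struct dentry *dentry)
{
	/* expose trusted.* names to privileged readers only */
	return capable(CAP_SYS_ADMIN);
}

static const struct xattr_handler example_trusted_handler = {
	.prefix	= XATTR_TRUSTED_PREFIX,
	.flags	= EXAMPLE_XATTR_TRUSTED,	/* hypothetical id */
	.list	= example_xattr_list,
	.get	= example_xattr_get,
};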
diff --git a/fs/stat.c b/fs/stat.c
index cccc1aab9a8b..bc045c7994e1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
#ifndef INIT_STRUCT_STAT_PADDING
@@ -367,8 +367,6 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
INIT_STRUCT_STAT64_PADDING(tmp);
#ifdef CONFIG_MIPS
/* mips has weird padding, so we don't get 64 bits there */
- if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
- return -EOVERFLOW;
tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_rdev = new_encode_dev(stat->rdev);
#else
diff --git a/fs/super.c b/fs/super.c
index 954aeb80e202..1182af8fd5ff 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1012,10 +1012,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
- char b[BDEVNAME_SIZE];
-
s->s_mode = mode;
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error) {
@@ -1199,7 +1197,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
else
ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
- WARN_ON(force_trylock & !ret);
+ WARN_ON(force_trylock && !ret);
return ret;
}
EXPORT_SYMBOL(__sb_start_write);
diff --git a/fs/sync.c b/fs/sync.c
index fbc98ee62044..dd5d1711c7ac 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
- filemap_fdatawait(bdev->bd_inode->i_mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
}
/*
@@ -343,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
- ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
if (ret < 0)
goto out_put;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6c95628ea377..f35523d4fa3a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -108,6 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
struct kobject *kobj = of->kn->parent->priv;
+ size_t len;
/*
* If buf != of->prealloc_buf, we don't know how
@@ -115,7 +116,8 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
*/
if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
return 0;
- return ops->show(kobj, of->kn->priv, buf);
+ len = ops->show(kobj, of->kn->priv, buf);
+ return min(count, len);
}
/* kernfs write callback for regular sysfs files */
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 39a019936768..dc1358b5ec95 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -73,13 +73,26 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
}
if (grp->bin_attrs) {
- for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
+ umode_t mode = (*bin_attr)->attr.mode;
+
if (update)
kernfs_remove_by_name(parent,
(*bin_attr)->attr.name);
+ if (grp->is_bin_visible) {
+ mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ if (!mode)
+ continue;
+ }
+
+ WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ "Attribute %s: Invalid permissions 0%o\n",
+ (*bin_attr)->attr.name, mode);
+
+ mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent,
&(*bin_attr)->attr, true,
- (*bin_attr)->attr.mode, NULL);
+ mode, NULL);
if (error)
break;
}
@@ -352,3 +365,47 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
+
+/**
+ * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
+ * to a group or an attribute
+ * @kobj: The kobject containing the group.
+ * @target_kobj: The target kobject.
+ * @target_name: The name of the target group or attribute.
+ */
+int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
+ struct kobject *target_kobj,
+ const char *target_name)
+{
+ struct kernfs_node *target;
+ struct kernfs_node *entry;
+ struct kernfs_node *link;
+
+ /*
+ * We don't own @target_kobj and it may be removed at any time.
+ * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir()
+ * for details.
+ */
+ spin_lock(&sysfs_symlink_target_lock);
+ target = target_kobj->sd;
+ if (target)
+ kernfs_get(target);
+ spin_unlock(&sysfs_symlink_target_lock);
+ if (!target)
+ return -ENOENT;
+
+ entry = kernfs_find_and_get(target_kobj->sd, target_name);
+ if (!entry) {
+ kernfs_put(target);
+ return -ENOENT;
+ }
+
+ link = kernfs_create_link(kobj->sd, target_name, entry);
+ if (IS_ERR(link) && PTR_ERR(link) == -EEXIST)
+ sysfs_warn_dup(kobj->sd, target_name);
+
+ kernfs_put(entry);
+ kernfs_put(target);
+ return IS_ERR(link) ? PTR_ERR(link) : 0;
+}
+EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
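The create_files() change above gives binary sysfs attributes the same visibility hook that regular attributes already had. A sketch of how a driver might use it (the example_* names and the feature probe are assumptions): return 0 to hide the file on this device, or a mode to override attr->attr.mode.

static umode_t example_bin_visible(struct kobject *kobj,
				   struct bin_attribute *attr, int idx)
{
	/* hide the blob on hardware lacking the feature */
	return example_has_feature(kobj) ? attr->attr.mode : 0;
}

static const struct attribute_group example_group = {
	.bin_attrs	= example_bin_attrs,	/* NULL-terminated array */
	.is_bin_visible	= example_bin_visible,
};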
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 590ad9206e3f..d62c423a5a2d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -146,8 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi,
static const struct inode_operations sysv_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = sysv_getattr,
};
@@ -162,15 +161,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
inode->i_fop = &sysv_dir_operations;
inode->i_mapping->a_ops = &sysv_aops;
} else if (S_ISLNK(inode->i_mode)) {
- if (inode->i_blocks) {
- inode->i_op = &sysv_symlink_inode_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else {
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)SYSV_I(inode)->i_data;
- nd_terminate_link(inode->i_link, inode->i_size,
- sizeof(SYSV_I(inode)->i_data) - 1);
- }
+ inode->i_op = &sysv_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &sysv_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
}
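[Annotation] inode_nohighmem() clears __GFP_HIGHMEM from the mapping so the symlink body is always kernel-addressable, which is what lets page_get_link() run without kmap(); its definition in include/linux/fs.h is essentially:

static inline void inode_nohighmem(struct inode *inode)
{
	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}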
@@ -353,7 +346,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index cbc8d5d2755a..c66f2423e1f5 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
+
+ if (IS_ERR(dentry)) {
mutex_unlock(&parent->d_inode->i_mutex);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ }
+
return dentry;
}
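[Annotation] The bug here was an unbalanced reference: start_creating() begins with simple_pin_fs() and the error path lacked the matching release. The pairing, as an abbreviated sketch of the surrounding function:

	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
			      &tracefs_mount_count);
	if (error)
		return ERR_PTR(error);
	/* ... lookup that may fail ... */
	if (IS_ERR(dentry)) {
		mutex_unlock(&parent->d_inode->i_mutex);
		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
	}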
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index ba66d508006a..7ff7712f284e 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -35,3 +35,18 @@ config UBIFS_FS_ZLIB
default y
help
Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
+
+config UBIFS_ATIME_SUPPORT
+ bool "Access time support" if UBIFS_FS
+ depends on UBIFS_FS
+ default n
+ help
+ Originally UBIFS did not support atime, because it looked like a bad idea
+ due to the increased flash wear. This option adds atime support; it is
+ disabled by default to preserve the old behavior. If you enable it, UBIFS
+ starts updating atime, which means that file-system read operations will
+ cause writes (inode atime updates). This may affect file-system performance
+ and increase flash device wear, so be careful. How often atime is updated
+ depends on the selected strategy: strictatime is the "heaviest", relatime
+ is lighter, etc.
+
+ If unsure, say 'N'.
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4c46a9865fa7..595ca0debe11 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2573,7 +2573,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
{
int err, failing;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
failing = power_cut_emulated(c, lnum, 1);
@@ -2595,7 +2595,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 1))
return -EROFS;
@@ -2611,7 +2611,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
@@ -2627,7 +2627,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
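[Annotation] All four hunks swap a direct read of c->dbg->pc_happened for the existing accessor, keeping every power-cut test behind one interface; dbg_is_power_cut() in fs/ubifs/debug.h is essentially:

static inline int dbg_is_power_cut(const struct ubifs_info *c)
{
	return !!c->dbg->pc_happened;
}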
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c27c66c224a..e49bd2808bf3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -449,13 +449,14 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
out:
+ kfree(file->private_data);
+ file->private_data = NULL;
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
- kfree(file->private_data);
- file->private_data = NULL;
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
return 0;
@@ -787,9 +788,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
if (S_ISBLK(mode) || S_ISCHR(mode)) {
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev)
@@ -1188,6 +1186,9 @@ const struct inode_operations ubifs_dir_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a3dfe2ae79f2..eff62801acbf 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1354,6 +1354,47 @@ static inline int mctime_update_needed(const struct inode *inode,
return 0;
}
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+/**
+ * ubifs_update_time - update time of inode.
+ * @inode: inode to update
+ * @time: new time to set
+ * @flags: mask of S_ATIME, S_CTIME and S_MTIME selecting which times to set
+ *
+ * This function updates the requested timestamps of the inode.
+ */
+int ubifs_update_time(struct inode *inode, struct timespec *time,
+ int flags)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+ int iflags = I_DIRTY_TIME;
+ int err, release;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&ui->ui_mutex);
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+
+ if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+ iflags |= I_DIRTY_SYNC;
+
+ release = ui->dirty;
+ __mark_inode_dirty(inode, iflags);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_budget(c, &req);
+ return 0;
+}
+#endif
+
/**
* update_ctime - update mtime and ctime of an inode.
* @inode: inode to update
@@ -1537,6 +1578,9 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (err)
return err;
vma->vm_ops = &ubifs_file_vm_ops;
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ file_accessed(file);
+#endif
return 0;
}
@@ -1557,17 +1601,23 @@ const struct inode_operations ubifs_file_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct inode_operations ubifs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.setxattr = ubifs_setxattr,
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 92a8491a8f8c..c0a95e393347 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -34,6 +34,12 @@
* node. We use "r5" hash borrowed from reiserfs.
*/
+/*
+ * Many of the key helpers take a struct ubifs_info *c as their first
+ * parameter even though it is currently unused. It is reserved for future
+ * extensions supporting different c->key_format values; right now there is
+ * only one key format, UBIFS_SIMPLE_KEY_FMT.
+ */
+
#ifndef __UBIFS_KEY_H__
#define __UBIFS_KEY_H__
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index dc9f27e9d61b..9a517109da0f 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1498,11 +1498,10 @@ static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
}
/* nnode is being committed, so copy it */
- n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ n = kmemdup(nnode, sizeof(struct ubifs_nnode), GFP_NOFS);
if (unlikely(!n))
return ERR_PTR(-ENOMEM);
- memcpy(n, nnode, sizeof(struct ubifs_nnode));
n->cnext = NULL;
__set_bit(DIRTY_CNODE, &n->flags);
__clear_bit(COW_CNODE, &n->flags);
@@ -1549,11 +1548,10 @@ static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
}
/* pnode is being committed, so copy it */
- p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ p = kmemdup(pnode, sizeof(struct ubifs_pnode), GFP_NOFS);
if (unlikely(!p))
return ERR_PTR(-ENOMEM);
- memcpy(p, pnode, sizeof(struct ubifs_pnode));
p->cnext = NULL;
__set_bit(DIRTY_CNODE, &p->flags);
__clear_bit(COW_CNODE, &p->flags);
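[Annotation] kmemdup() folds the kmalloc()+memcpy() pair into one call with identical semantics; its core in mm/util.c is essentially:

void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc(len, gfp);	/* kmalloc_track_caller() upstream */
	if (p)
		memcpy(p, src, len);
	return p;
}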
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index ee7cb5ebb6e8..8ece6ca58c0b 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -155,13 +155,8 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
*/
static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
{
- if (new_valid_dev(rdev)) {
- dev->new = cpu_to_le32(new_encode_dev(rdev));
- return sizeof(dev->new);
- } else {
- dev->huge = cpu_to_le64(huge_encode_dev(rdev));
- return sizeof(dev->huge);
- }
+ dev->new = cpu_to_le32(new_encode_dev(rdev));
+ return sizeof(dev->new);
}
/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 695fc71d5244..586d59347fff 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -789,7 +789,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
corrupted_rescan:
/* Re-scan the corrupted data with verbose messages */
ubifs_err(c, "corruption %d", ret);
- ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
@@ -1331,8 +1331,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c)
struct size_entry *e, *n;
rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
kfree(e);
}
@@ -1533,8 +1532,7 @@ int ubifs_recover_size(struct ubifs_info *c)
err = fix_size_in_place(c, e);
if (err)
return err;
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
}
}
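[Annotation] Dropping the NULL checks is safe because iput() tolerates a NULL inode; fs/inode.c starts the function with:

void iput(struct inode *inode)
{
	if (!inode)
		return;
	/* ... drop the reference ... */
}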
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9547a27868ad..a233ba913be4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -128,7 +128,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
if (err)
goto out_ino;
- inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+ inode->i_flags |= S_NOCMTIME;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ inode->i_flags |= S_NOATIME;
+#endif
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
@@ -2037,7 +2040,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (c->max_inode_sz > MAX_LFS_FILESIZE)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
- sb->s_xattr = ubifs_xattr_handlers;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
@@ -2139,7 +2141,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
if (err)
goto out_deact;
/* We do not support atime */
- sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+ sb->s_flags |= MS_ACTIVE;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ sb->s_flags |= MS_NOATIME;
+#else
+ ubifs_msg(c, "full atime support is enabled.");
+#endif
}
/* 'fill_super()' opens ubi again so we must close it here */
@@ -2241,8 +2248,8 @@ static int __init ubifs_init(void)
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
- SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
- &inode_slab_ctor);
+ SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT, &inode_slab_ctor);
if (!ubifs_inode_slab)
return -ENOMEM;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 957f5757f374..fa9a20cc60d6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -198,11 +198,10 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
{
struct ubifs_znode *zn;
- zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+ zn = kmemdup(znode, c->max_znode_sz, GFP_NOFS);
if (unlikely(!zn))
return ERR_PTR(-ENOMEM);
- memcpy(zn, znode, c->max_znode_sz);
zn->cnext = NULL;
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index de759022f3d6..a5697de763f5 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -858,9 +858,9 @@ struct ubifs_compressor {
* @mod_dent: non-zero if the operation removes or modifies an existing
* directory entry
* @new_ino: non-zero if the operation adds a new inode
- * @new_ino_d: now much data newly created inode contains
+ * @new_ino_d: how much data newly created inode contains
* @dirtied_ino: how many inodes the operation makes dirty
- * @dirtied_ino_d: now much data dirtied inode contains
+ * @dirtied_ino_d: how much data dirtied inode contains
* @idx_growth: how much the index will supposedly grow
* @data_growth: how much new data the operation will supposedly add
* @dd_growth: how much data that makes other data dirty the operation will
@@ -1470,7 +1470,6 @@ extern spinlock_t ubifs_infos_lock;
extern atomic_long_t ubifs_clean_zn_cnt;
extern struct kmem_cache *ubifs_inode_slab;
extern const struct super_operations ubifs_super_operations;
-extern const struct xattr_handler *ubifs_xattr_handlers[];
extern const struct address_space_operations ubifs_file_address_operations;
extern const struct file_operations ubifs_file_operations;
extern const struct inode_operations ubifs_file_inode_operations;
@@ -1746,6 +1745,9 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
+#endif
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index fd65b3f1923c..e53292d0c21b 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -200,6 +200,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
int err;
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_inode *ui = ubifs_inode(inode);
+ void *buf = NULL;
struct ubifs_budget_req req = { .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
@@ -208,14 +209,17 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
if (err)
return err;
- kfree(ui->data);
- ui->data = kmemdup(value, size, GFP_NOFS);
- if (!ui->data) {
+ buf = kmemdup(value, size, GFP_NOFS);
+ if (!buf) {
err = -ENOMEM;
goto out_free;
}
+ mutex_lock(&ui->ui_mutex);
+ kfree(ui->data);
+ ui->data = buf;
inode->i_size = ui->ui_size = size;
ui->data_len = size;
+ mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
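[Annotation] The reshuffle above is the standard allocate-outside-the-lock, swap-under-the-lock pattern: the sleeping kmemdup() moves before ui_mutex is taken, and readers serialized on the mutex never see a half-updated buffer. Schematically (names illustrative, not kernel API):

	new = kmemdup(value, size, GFP_NOFS);	/* may sleep, no lock held */
	if (!new)
		return -ENOMEM;

	mutex_lock(&lock);
	kfree(old_buf);				/* replace under the lock */
	old_buf = new;
	mutex_unlock(&lock);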
@@ -263,7 +267,7 @@ static int check_namespace(const struct qstr *nm)
if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN)) {
- if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
+ if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0')
return -EINVAL;
type = TRUSTED_XATTR;
} else if (!strncmp(nm->name, XATTR_USER_PREFIX,
@@ -273,7 +277,7 @@ static int check_namespace(const struct qstr *nm)
type = USER_XATTR;
} else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN)) {
- if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
+ if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0')
return -EINVAL;
type = SECURITY_XATTR;
} else
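[Annotation] The two spellings are numerically identical, so this is a consistency cleanup rather than a behavior change; include/uapi/linux/xattr.h defines:

#define XATTR_TRUSTED_PREFIX "trusted."
#define XATTR_TRUSTED_PREFIX_LEN (sizeof(XATTR_TRUSTED_PREFIX) - 1)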
@@ -409,6 +413,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
ubifs_assert(inode->i_size == ui->data_len);
ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+ mutex_lock(&ui->ui_mutex);
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
@@ -423,6 +428,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
err = ui->data_len;
out_iput:
+ mutex_unlock(&ui->ui_mutex);
iput(inode);
out_unlock:
kfree(xent);
@@ -582,46 +588,6 @@ out_free:
return err;
}
-static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int flags)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
-
- return total_len;
-}
-
-static int security_getxattr(struct dentry *d, const char *name, void *buffer,
- size_t size, int flags)
-{
- return ubifs_getxattr(d, name, buffer, size);
-}
-
-static int security_setxattr(struct dentry *d, const char *name,
- const void *value, size_t size, int flags,
- int handler_flags)
-{
- return ubifs_setxattr(d, name, value, size, flags);
-}
-
-static const struct xattr_handler ubifs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = security_listxattr,
- .get = security_getxattr,
- .set = security_setxattr,
-};
-
-const struct xattr_handler *ubifs_xattr_handlers[] = {
- &ubifs_xattr_security_handler,
- NULL,
-};
-
static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 6d6a96b4e73f..e0fd65fe73e8 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb,
*/
int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
- struct allocExtDesc *aed;
eloc.logicalBlockNum = start;
elen = EXT_RECORDED_ALLOCATED |
@@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb,
}
if (epos.offset + (2 * adsize) > sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- int loffset;
-
- brelse(oepos.bh);
- oepos = epos;
-
/* Steal a block from the extent being free'd */
- epos.block.logicalBlockNum = eloc.logicalBlockNum;
+ udf_setup_indirect_aext(table, eloc.logicalBlockNum,
+ &epos);
+
eloc.logicalBlockNum++;
elen -= sb->s_blocksize;
-
- epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, &epos.block, 0));
- if (!epos.bh) {
- brelse(oepos.bh);
- goto error_return;
- }
- aed = (struct allocExtDesc *)(epos.bh->b_data);
- aed->previousAllocExtLocation =
- cpu_to_le32(oepos.block.logicalBlockNum);
- if (epos.offset + adsize > sb->s_blocksize) {
- loffset = epos.offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = iinfo->i_ext.i_data + epos.offset
- - adsize;
- dptr = epos.bh->b_data +
- sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos.offset = sizeof(struct allocExtDesc) +
- adsize;
- } else {
- loffset = epos.offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- if (oepos.bh) {
- sptr = oepos.bh->b_data + epos.offset;
- aed = (struct allocExtDesc *)
- oepos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs,
- adsize);
- } else {
- sptr = iinfo->i_ext.i_data +
- epos.offset;
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- }
- epos.offset = sizeof(struct allocExtDesc);
- }
- if (sbi->s_udfrev >= 0x0200)
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 3, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
- else
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 2, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
-
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos.block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- lad->extLocation =
- cpu_to_lelb(epos.block);
- break;
- }
- if (oepos.bh) {
- udf_update_tag(oepos.bh->b_data, loffset);
- mark_buffer_dirty(oepos.bh);
- } else {
- mark_inode_dirty(table);
- }
}
/* It's possible that stealing the block emptied the extent */
- if (elen) {
- udf_write_aext(table, &epos, &eloc, elen, 1);
-
- if (!epos.bh) {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- } else {
- aed = (struct allocExtDesc *)epos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- udf_update_tag(epos.bh->b_data, epos.offset);
- mark_buffer_dirty(epos.bh);
- }
- }
+ if (elen)
+ __udf_add_aext(table, &epos, &eloc, elen, 1);
}
brelse(epos.bh);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d0b3ade0ff0..87dc16d15572 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode,
udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
count++;
- } else
+ } else {
+ struct kernel_lb_addr tmploc;
+ uint32_t tmplen;
+
udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
+ /*
+ * We've rewritten the last extent but there may be an empty
+ * indirect extent after it - enter it.
+ */
+ udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+ }
/* Managed to do everything necessary? */
if (!blocks)
@@ -1540,7 +1549,8 @@ reread:
break;
case ICBTAG_FILE_TYPE_SYMLINK:
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
break;
case ICBTAG_FILE_TYPE_MAIN:
@@ -1866,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
return inode;
}
-int udf_add_aext(struct inode *inode, struct extent_position *epos,
- struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos)
{
- int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh;
struct allocExtDesc *aed;
- uint8_t *ptr;
- struct udf_inode_info *iinfo = UDF_I(inode);
+ struct extent_position nepos;
+ struct kernel_lb_addr neloc;
+ int ver, adsize;
- if (!epos->bh)
- ptr = iinfo->i_ext.i_data + epos->offset -
- udf_file_entry_alloc_offset(inode) +
- iinfo->i_lenEAttr;
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
else
- ptr = epos->bh->b_data + epos->offset;
+ return -EIO;
+
+ neloc.logicalBlockNum = block;
+ neloc.partitionReferenceNum = epos->block.partitionReferenceNum;
+
+ bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0));
+ if (!bh)
+ return -EIO;
+ lock_buffer(bh);
+ memset(bh->b_data, 0x00, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ mark_buffer_dirty_inode(bh, inode);
+
+ aed = (struct allocExtDesc *)(bh->b_data);
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) {
+ aed->previousAllocExtLocation =
+ cpu_to_le32(epos->block.logicalBlockNum);
+ }
+ aed->lengthAllocDescs = cpu_to_le32(0);
+ if (UDF_SB(sb)->s_udfrev >= 0x0200)
+ ver = 3;
+ else
+ ver = 2;
+ udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block,
+ sizeof(struct tag));
+
+ nepos.block = neloc;
+ nepos.offset = sizeof(struct allocExtDesc);
+ nepos.bh = bh;
+
+ /*
+ * Do we have to copy current last extent to make space for indirect
+ * one?
+ */
+ if (epos->offset + adsize > sb->s_blocksize) {
+ struct kernel_lb_addr cp_loc;
+ uint32_t cp_len;
+ int cp_type;
+
+ epos->offset -= adsize;
+ cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0);
+ cp_len |= ((uint32_t)cp_type) << 30;
+
+ __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1);
+ udf_write_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ } else {
+ __udf_add_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ }
+
+ brelse(epos->bh);
+ *epos = nepos;
+
+ return 0;
+}
+
+/*
+ * Append an extent at the given position, which must be the first free slot
+ * in the inode or indirect extent. This function assumes there is enough
+ * space there; use udf_add_aext() if you have not checked for that.
+ */
+int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct allocExtDesc *aed;
+ int adsize;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -1890,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
else
return -EIO;
- if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- struct buffer_head *nbh;
- int err, loffset;
- struct kernel_lb_addr obloc = epos->block;
-
- epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
- obloc.partitionReferenceNum,
- obloc.logicalBlockNum, &err);
- if (!epos->block.logicalBlockNum)
- return -ENOSPC;
- nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
- &epos->block,
- 0));
- if (!nbh)
- return -EIO;
- lock_buffer(nbh);
- memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
- set_buffer_uptodate(nbh);
- unlock_buffer(nbh);
- mark_buffer_dirty_inode(nbh, inode);
-
- aed = (struct allocExtDesc *)(nbh->b_data);
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
- aed->previousAllocExtLocation =
- cpu_to_le32(obloc.logicalBlockNum);
- if (epos->offset + adsize > inode->i_sb->s_blocksize) {
- loffset = epos->offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = ptr - adsize;
- dptr = nbh->b_data + sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos->offset = sizeof(struct allocExtDesc) + adsize;
- } else {
- loffset = epos->offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- sptr = ptr;
- epos->offset = sizeof(struct allocExtDesc);
-
- if (epos->bh) {
- aed = (struct allocExtDesc *)epos->bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- } else {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(inode);
- }
- }
- if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- else
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos->block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- lad->extLocation = cpu_to_lelb(epos->block);
- memset(lad->impUse, 0x00, sizeof(lad->impUse));
- break;
- }
- if (epos->bh) {
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos->bh->b_data, loffset);
- else
- udf_update_tag(epos->bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos->bh, inode);
- brelse(epos->bh);
- } else {
- mark_inode_dirty(inode);
- }
- epos->bh = nbh;
+ if (!epos->bh) {
+ WARN_ON(iinfo->i_lenAlloc !=
+ epos->offset - udf_file_entry_alloc_offset(inode));
+ } else {
+ aed = (struct allocExtDesc *)epos->bh->b_data;
+ WARN_ON(le32_to_cpu(aed->lengthAllocDescs) !=
+ epos->offset - sizeof(struct allocExtDesc));
+ WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize);
}
udf_write_aext(inode, epos, eloc, elen, inc);
@@ -1995,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
return 0;
}
+/*
+ * Append an extent at the given position, which must be the first free slot
+ * in the inode or indirect extent. Takes care of allocating and linking
+ * indirect blocks.
+ */
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ int adsize;
+ struct super_block *sb = inode->i_sb;
+
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
+ else
+ return -EIO;
+
+ if (epos->offset + (2 * adsize) > sb->s_blocksize) {
+ int err;
+ int new_block;
+
+ new_block = udf_new_block(sb, NULL,
+ epos->block.partitionReferenceNum,
+ epos->block.logicalBlockNum, &err);
+ if (!new_block)
+ return -ENOSPC;
+
+ err = udf_setup_indirect_aext(inode, new_block, epos);
+ if (err)
+ return err;
+ }
+
+ return __udf_add_aext(inode, epos, eloc, elen, inc);
+}
+
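[Annotation] The refactor splits the old monolithic udf_add_aext() into three layers; the resulting call flow, as a sketch:

/*
 * udf_add_aext()
 *   if the current block cannot hold two more descriptors:
 *     udf_new_block()             allocate a block for the new AED
 *     udf_setup_indirect_aext()   initialize it, link it, repoint epos
 *   __udf_add_aext()              append into guaranteed free space
 */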
void udf_write_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
@@ -2047,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos,
epos->offset += adsize;
}
+/*
+ * Only one indirect extent in a row really makes sense, but allow up to 16
+ * in case someone does something weird.
+ */
+#define UDF_MAX_INDIR_EXTS 16
+
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int8_t etype;
+ unsigned int indirections = 0;
while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
(EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
int block;
+
+ if (++indirections > UDF_MAX_INDIR_EXTS) {
+ udf_err(inode->i_sb,
+ "too many indirect extents in inode %lu\n",
+ inode->i_ino);
+ return -1;
+ }
+
epos->block = *eloc;
epos->offset = sizeof(struct allocExtDesc);
brelse(epos->bh);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c97b5a8d1e24..42eafb91f7ff 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -921,7 +921,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
}
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
struct kernel_lb_addr eloc;
@@ -1344,8 +1345,3 @@ const struct inode_operations udf_dir_inode_operations = {
.rename = udf_rename,
.tmpfile = udf_tmpfile,
};
-const struct inode_operations udf_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 81155b9b445b..0fbb4c7c72e8 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT),
init_once);
if (!udf_inode_cachep)
return -ENOMEM;
@@ -1586,6 +1587,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
}
/*
+ * Maximum number of Terminating Descriptor redirections. The chosen number
+ * is arbitrary - large enough not to limit any real use of rewritten inodes
+ * on write-once media, yet small enough to avoid looping for too long on
+ * corrupted media.
+ */
+#define UDF_MAX_TD_NESTING 64
+
+/*
* Process a main/reserve volume descriptor sequence.
* @block First block of first extent of the sequence.
* @lastblock Lastblock of first extent of the sequence.
@@ -1609,6 +1617,7 @@ static noinline int udf_process_sequence(
uint16_t ident;
long next_s = 0, next_e = 0;
int ret;
+ unsigned int indirections = 0;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
@@ -1679,6 +1688,12 @@ static noinline int udf_process_sequence(
}
break;
case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
+ if (++indirections > UDF_MAX_TD_NESTING) {
+ udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING);
+ brelse(bh);
+ return -EIO;
+ }
+
vds[VDS_POS_TERMINATING_DESC].block = block;
if (next_e) {
block = next_s;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 862535b3ba58..8d619773056b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page)
struct buffer_head *bh = NULL;
unsigned char *symlink;
int err;
- unsigned char *p = kmap(page);
+ unsigned char *p = page_address(page);
struct udf_inode_info *iinfo;
uint32_t pos;
@@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page)
up_read(&iinfo->i_data_sem);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
@@ -149,7 +148,6 @@ out_unlock_inode:
up_read(&iinfo->i_data_sem);
SetPageError(page);
out_unmap:
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 47bb3f5ca360..fa0044b6b81d 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations;
extern const struct file_operations udf_dir_operations;
extern const struct inode_operations udf_file_inode_operations;
extern const struct file_operations udf_file_operations;
-extern const struct inode_operations udf_symlink_inode_operations;
extern const struct address_space_operations udf_aops;
extern const struct address_space_operations udf_adinicb_aops;
extern const struct address_space_operations udf_symlink_aops;
@@ -159,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
struct kernel_lb_addr *, uint32_t *, sector_t *);
+extern int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos);
+extern int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc);
extern int udf_add_aext(struct inode *, struct extent_position *,
struct kernel_lb_addr *, uint32_t, int);
extern void udf_write_aext(struct inode *, struct extent_position *,
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index ab478e62baae..e788a05aab83 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
if (c < 0x80U)
utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
else if (c < 0x800U) {
+ if (utf_o->u_len > (UDF_NAME_LEN - 4))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xc0 | (c >> 6));
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0x80 | (c & 0x3f));
} else {
+ if (utf_o->u_len > (UDF_NAME_LEN - 5))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xe0 | (c >> 12));
utf_o->u_name[utf_o->u_len++] =
@@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
{
unsigned c, i, max_val, utf_char;
- int utf_cnt, u_len;
+ int utf_cnt, u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
utf_char = 0U;
utf_cnt = 0U;
for (i = 0U; i < utf->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
+
c = (uint8_t)utf->u_name[i];
/* Complete a multi-byte UTF-8 character */
@@ -225,6 +234,7 @@ try_again:
if (max_val == 0xffU) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
goto error_out;
@@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
c = (c << 8) | ocu[i++];
len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - utf_o->u_len);
+ UDF_NAME_LEN - 2 - utf_o->u_len);
/* Valid character? */
if (len >= 0)
utf_o->u_len += len;
@@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
int len;
unsigned i, max_val;
uint16_t uni_char;
- int u_len;
+ int u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
for (i = 0U; i < uni->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
if (!len)
continue;
@@ -316,6 +330,7 @@ try_again:
if (uni_char > max_val) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index 392db25c0b56..ec4a6b49fa13 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -5,5 +5,5 @@
obj-$(CONFIG_UFS_FS) += ufs.o
ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
- namei.o super.o symlink.o util.o
+ namei.o super.o util.o
ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a064cf44b143..d897e169ab9c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -528,11 +528,12 @@ static void ufs_set_inode_ops(struct inode *inode)
inode->i_mapping->a_ops = &ufs_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (!inode->i_blocks) {
- inode->i_op = &ufs_fast_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+ inode->i_op = &simple_symlink_inode_operations;
} else {
- inode->i_op = &ufs_symlink_inode_operations;
inode->i_mapping->a_ops = &ufs_aops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
}
} else
init_special_inode(inode, inode->i_mode,
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 47966554317c..acf4a3b61b81 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -123,14 +123,15 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
- inode->i_op = &ufs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &ufs_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
} else {
/* fast symlink */
- inode->i_op = &ufs_fast_symlink_inode_operations;
+ inode->i_op = &simple_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f6390eec02ca..442fd52ebffe 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1427,7 +1427,7 @@ static int __init init_inodecache(void)
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ufs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
deleted file mode 100644
index 874480bb43e9..000000000000
--- a/fs/ufs/symlink.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * linux/fs/ufs/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@emai.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- * from
- *
- * linux/fs/ext2/symlink.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/symlink.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext2 symlink handling code
- */
-
-#include "ufs_fs.h"
-#include "ufs.h"
-
-const struct inode_operations ufs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = simple_follow_link,
- .setattr = ufs_setattr,
-};
-
-const struct inode_operations ufs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
- .setattr = ufs_setattr,
-};
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 7da4aca868c0..c87f4c3fa9dd 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -136,10 +136,6 @@ extern __printf(3, 4)
void ufs_panic(struct super_block *, const char *, const char *, ...);
void ufs_mark_sb_dirty(struct super_block *sb);
-/* symlink.c */
-extern const struct inode_operations ufs_fast_symlink_inode_operations;
-extern const struct inode_operations ufs_symlink_inode_operations;
-
static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
diff --git a/fs/xattr.c b/fs/xattr.c
index 072fee1258dd..d5dd6c8b82a7 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -208,25 +208,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
return error;
}
-/* Compare an extended attribute value with the given value */
-int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
- const char *value, size_t size, gfp_t flags)
-{
- char *xattr_value = NULL;
- int rc;
-
- rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
- if (rc < 0)
- return rc;
-
- if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
- rc = -EINVAL;
- else
- rc = 0;
- kfree(xattr_value);
- return rc;
-}
-
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
@@ -324,7 +305,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
{
int error;
void *kvalue = NULL;
- void *vvalue = NULL; /* If non-NULL, we used vmalloc() */
char kname[XATTR_NAME_MAX + 1];
if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
@@ -341,10 +321,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
return -E2BIG;
kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
@@ -357,10 +336,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
error = vfs_setxattr(d, kname, kvalue, size, flags);
out:
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+ kvfree(kvalue);
+
return error;
}
@@ -428,7 +405,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
{
ssize_t error;
void *kvalue = NULL;
- void *vvalue = NULL;
char kname[XATTR_NAME_MAX + 1];
error = strncpy_from_user(kname, name, sizeof(kname));
@@ -442,10 +418,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
size = XATTR_SIZE_MAX;
kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
}
@@ -461,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+
+ kvfree(kvalue);
+
return error;
}
@@ -521,17 +495,15 @@ listxattr(struct dentry *d, char __user *list, size_t size)
{
ssize_t error;
char *klist = NULL;
- char *vlist = NULL; /* If non-NULL, we used vmalloc() */
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
if (!klist) {
- vlist = vmalloc(size);
- if (!vlist)
+ klist = vmalloc(size);
+ if (!klist)
return -ENOMEM;
- klist = vlist;
}
}
@@ -544,10 +516,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vlist)
- vfree(vlist);
- else
- kfree(klist);
+
+ kvfree(klist);
+
return error;
}
@@ -700,13 +671,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
return NULL;
for_each_xattr_handler(handlers, handler) {
- const char *n = strcmp_prefix(*name, handler->prefix);
+ const char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
*name = n;
- break;
+ return handler;
}
}
- return handler;
+ return ERR_PTR(-EOPNOTSUPP);
}
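[Annotation] The rewrite lets a handler match either by prefix or by full name: xattr_prefix() falls back to handler->name when no prefix is set, and the `!handler->prefix ^ !*n` test returns -EINVAL for a prefix handler matched with an empty suffix while skipping name-only handlers whose match has trailing characters. The two helpers involved are essentially:

/* Return the suffix of @a after @a_prefix, or NULL if it doesn't match. */
static const char *strcmp_prefix(const char *a, const char *a_prefix)
{
	while (*a_prefix && *a == *a_prefix) {
		a++;
		a_prefix++;
	}
	return *a_prefix ? NULL : a;
}

static inline const char *xattr_prefix(const struct xattr_handler *handler)
{
	return handler->prefix ?: handler->name;
}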
/*
@@ -718,9 +696,9 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -735,19 +713,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!buffer) {
for_each_xattr_handler(handlers, handler) {
- size += handler->list(dentry, NULL, 0, NULL, 0,
- handler->flags);
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ size += strlen(handler->name) + 1;
}
} else {
char *buf = buffer;
+ size_t len;
for_each_xattr_handler(handlers, handler) {
- size = handler->list(dentry, buf, buffer_size,
- NULL, 0, handler->flags);
- if (size > buffer_size)
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ len = strlen(handler->name);
+ if (len + 1 > buffer_size)
return -ERANGE;
- buf += size;
- buffer_size -= size;
+ memcpy(buf, handler->name, len + 1);
+ buf += len + 1;
+ buffer_size -= len + 1;
}
size = buf - buffer;
}
@@ -765,9 +749,9 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
if (size == 0)
value = ""; /* empty EA, do not remove */
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -780,10 +764,9 @@ generic_removexattr(struct dentry *dentry, const char *name)
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0,
- XATTR_REPLACE, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(generic_getxattr);
@@ -791,6 +774,30 @@ EXPORT_SYMBOL(generic_listxattr);
EXPORT_SYMBOL(generic_setxattr);
EXPORT_SYMBOL(generic_removexattr);
+/**
+ * xattr_full_name - Compute full attribute name from suffix
+ *
+ * @handler: handler of the xattr_handler operation
+ * @name: name passed to the xattr_handler operation
+ *
+ * The get and set xattr handler operations are called with the remainder of
+ * the attribute name after skipping the handler's prefix: for example, "foo"
+ * is passed to the get operation of a handler with prefix "user." to get
+ * attribute "user.foo". The full name, including the prefix, is still
+ * present in memory immediately before @name, which is what this helper
+ * relies on.
+ *
+ * Note: the list xattr handler operation when called from the vfs is passed a
+ * NULL name; some file systems use this operation internally, with varying
+ * semantics.
+ */
+const char *xattr_full_name(const struct xattr_handler *handler,
+ const char *name)
+{
+ size_t prefix_len = strlen(xattr_prefix(handler));
+
+ return name - prefix_len;
+}
+EXPORT_SYMBOL(xattr_full_name);
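[Annotation] A handler can therefore recover the prefixed name without copying; for example (illustrative):

	/* in a handler with .prefix = "user.", @name arrives as "foo" */
	const char *full = xattr_full_name(handler, name);	/* "user.foo" */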
+
/*
* Allocate new xattr and copy in the value; but leave the name to callers.
*/
@@ -840,8 +847,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
return ret;
}
-static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
+/**
+ * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
+ * @xattrs: target simple_xattr list
+ * @name: name of the extended attribute
+ * @value: value of the xattr. If %NULL, will remove the attribute.
+ * @size: size of the new xattr
+ * @flags: %XATTR_{CREATE|REPLACE}
+ *
+ * If %XATTR_CREATE is set, the xattr shouldn't already exist; otherwise fails
+ * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
+ * otherwise, fails with -ENODATA.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+ const void *value, size_t size, int flags)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
@@ -891,73 +912,64 @@ out:
}
-/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the new extended attribute
- * @value: value of the new xattr. If %NULL, will remove the attribute
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- *
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
- *
- * Returns 0 on success, -errno on failure.
- */
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
-{
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- return __simple_xattr_set(xattrs, name, value, size, flags);
-}
-
-/*
- * xattr REMOVE operation for in-memory/pseudo filesystems
- */
-int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
+static bool xattr_is_trusted(const char *name)
{
- return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
+ return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static bool xattr_is_trusted(const char *name)
+static int xattr_list_one(char **buffer, ssize_t *remaining_size,
+ const char *name)
{
- return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ size_t len = strlen(name) + 1;
+ if (*buffer) {
+ if (*remaining_size < len)
+ return -ERANGE;
+ memcpy(*buffer, name, len);
+ *buffer += len;
+ }
+ *remaining_size -= len;
+ return 0;
}
/*
* xattr LIST operation for in-memory/pseudo filesystems
*/
-ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
- size_t size)
+ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
+ char *buffer, size_t size)
{
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
- size_t used = 0;
+ ssize_t remaining_size = size;
+ int err;
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (inode->i_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err)
+ return err;
+ }
+ if (inode->i_default_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err)
+ return err;
+ }
+#endif
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
- size_t len;
-
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
- len = strlen(xattr->name) + 1;
- used += len;
- if (buffer) {
- if (size < used) {
- used = -ERANGE;
- break;
- }
- memcpy(buffer, xattr->name, len);
- buffer += len;
- }
+ err = xattr_list_one(&buffer, &remaining_size, xattr->name);
+ if (err)
+ return err;
}
spin_unlock(&xattrs->lock);
- return used;
+ return size - remaining_size;
}
/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a096841bd06c..f64639176670 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_stats.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
@@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
-xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63bb360..686ba6fb20dd 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
+ current->comm, current->pid,
+ (unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
+ __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768fc068..d1c66e465ca5 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ffad7f20342f..a708e38b494c 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -482,7 +482,9 @@ xfs_agfl_verify(
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return false;
}
- return true;
+
+ return xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
}
static void
@@ -533,6 +535,7 @@ xfs_agfl_write_verify(
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .name = "xfs_agfl",
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
};
@@ -651,8 +654,8 @@ xfs_alloc_ag_vextent(
-((long)(args->len)));
}
- XFS_STATS_INC(xs_allocx);
- XFS_STATS_ADD(xs_allocb, args->len);
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
return error;
}
@@ -1808,8 +1811,8 @@ xfs_free_ag_extent(
if (!isfl)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
- XFS_STATS_INC(xs_freex);
- XFS_STATS_ADD(xs_freeb, len);
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
@@ -1924,7 +1927,7 @@ xfs_alloc_space_available(
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_fix_freelist(
struct xfs_alloc_arg *args, /* allocation argument structure */
int flags) /* XFS_ALLOC_FLAG_... */
@@ -2259,9 +2262,13 @@ xfs_agf_verify(
{
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
return false;
+ }
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
@@ -2333,6 +2340,7 @@ xfs_agf_write_verify(
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .name = "xfs_agf",
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
};
@@ -2503,7 +2511,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2634,6 +2642,14 @@ xfs_alloc_vextent(
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
args->len);
#endif
+
+ /* Zero the extent if we were asked to do so */
+ if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+ if (error)
+ goto error0;
+ }
+
}
xfs_perag_put(args->pag);
return 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ca1c8168373a..135eb3d24db7 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
+ struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* set if this is user data */
+ char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
* Defines for userdata
*/
-#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
@@ -233,5 +235,6 @@ xfs_alloc_get_rec(
int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
#endif /* __XFS_ALLOC_H__ */
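[Annotation] Turning the userdata values into distinct bits is what allows the new XFS_ALLOC_USERDATA_ZERO flag to be OR-ed in, and why the equality test in xfs_alloc_vextent() above became a bitwise AND. A caller might now request (illustrative):

	args.userdata = XFS_ALLOC_USERDATA | XFS_ALLOC_INITIAL_USER_DATA;
	if (zero_on_alloc)			/* hypothetical predicate */
		args.userdata |= XFS_ALLOC_USERDATA_ZERO;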
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 90de071dd4c2..444626ddbd1b 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -293,14 +293,7 @@ xfs_allocbt_verify(
level = be16_to_cpu(block->bb_level);
switch (block->bb_magic) {
case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_ABTB_MAGIC):
@@ -311,14 +304,7 @@ xfs_allocbt_verify(
return false;
break;
case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_ABTC_MAGIC):
@@ -332,21 +318,7 @@ xfs_allocbt_verify(
return false;
}
- /* numrecs verification */
- if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
- return false;
-
- /* sibling pointer verification */
- if (!block->bb_u.s.bb_leftsib ||
- (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- if (!block->bb_u.s.bb_rightsib ||
- (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
-
- return true;
+ return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
static void
@@ -379,6 +351,7 @@ xfs_allocbt_write_verify(
}
const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .name = "xfs_allocbt",
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index ff065578969f..fa3b948ef9c2 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -125,7 +125,7 @@ xfs_attr_get(
uint lock_mode;
int error;
- XFS_STATS_INC(xs_attr_get);
+ XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
@@ -207,9 +207,9 @@ xfs_attr_set(
struct xfs_trans_res tres;
xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
- int error, err2, committed, local;
+ int error, err2, local;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(mp, xs_attr_set);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -334,25 +334,15 @@ xfs_attr_set(
*/
xfs_bmap_init(args.flist, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
- if (!error) {
- error = xfs_bmap_finish(&args.trans, args.flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args.trans, args.flist, dp);
if (error) {
- ASSERT(committed);
args.trans = NULL;
xfs_bmap_cancel(&flist);
goto out;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args.trans, dp, 0);
-
- /*
* Commit the leaf transformation. We'll need another (linked)
* transaction to add the new attribute to the leaf.
*/
@@ -412,7 +402,7 @@ xfs_attr_remove(
xfs_fsblock_t firstblock;
int error;
- XFS_STATS_INC(xs_attr_remove);
+ XFS_STATS_INC(mp, xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
{
xfs_inode_t *dp;
struct xfs_buf *bp;
- int retval, error, committed, forkoff;
+ int retval, error, forkoff;
trace_xfs_attr_leaf_addname(args);
@@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- /*
* Commit the current trans (including the inode) and start
* a new one.
*/
@@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
{
xfs_inode_t *dp;
struct xfs_buf *bp;
- int error, committed, forkoff;
+ int error, forkoff;
trace_xfs_attr_leaf_removename(args);
@@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
return 0;
}
@@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args)
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
xfs_mount_t *mp;
- int committed, retval, error;
+ int retval, error;
trace_xfs_attr_node_addname(args);
@@ -938,27 +897,16 @@ restart:
state = NULL;
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
/*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- /*
* Commit the node conversion and start the next
* trans in the chain.
*/
@@ -977,23 +925,13 @@ restart:
*/
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_split(state);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1086,25 +1024,14 @@ restart:
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_join(state);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
struct xfs_buf *bp;
- int retval, error, committed, forkoff;
+ int retval, error, forkoff;
trace_xfs_attr_node_removename(args);
@@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_join(state);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
/*
* Commit the Btree join operation and start a new trans.
*/
@@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
} else
xfs_trans_brelse(args->trans, bp);
}
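
All of the xfs_attr.c hunks above are the same mechanical conversion: xfs_bmap_finish() now takes the inode and re-joins it to any new transaction itself, so the committed out-parameter and the follow-up xfs_trans_ijoin() boilerplate disappear at every call site. A toy model of the before/after contract (stub types, purely illustrative):

#include <stdio.h>

struct trans { int id; };
struct inode { struct trans *joined; };

/* old contract: the caller learned via *committed that the transaction
 * rolled, and then had to re-join the inode by hand */
static int finish_old(struct trans **tp, int *committed)
{
	(*tp)->id++;
	*committed = 1;
	return 0;
}

/* new contract: pass the inode in and the re-join happens internally,
 * standing in for the xfs_trans_ijoin() the real code now performs */
static int finish_new(struct trans **tp, struct inode *ip)
{
	(*tp)->id++;
	if (ip)
		ip->joined = *tp;
	return 0;
}

int main(void)
{
	struct trans t = { .id = 1 }, *tp = &t;
	struct inode ino = { .joined = NULL };
	int committed;

	finish_old(&tp, &committed);
	if (committed)
		ino.joined = tp;	/* the boilerplate being removed */

	finish_new(&tp, &ino);		/* one call, no boilerplate */
	printf("inode joined to trans %d\n", ino.joined->id);
	return 0;
}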
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 33df52d97ec7..01a5ecfedfcf 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
#include "xfs_dir2.h"
+#include "xfs_log.h"
/*
@@ -266,6 +267,8 @@ xfs_attr3_leaf_verify(
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
@@ -325,6 +328,7 @@ xfs_attr3_leaf_read_verify(
}
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .name = "xfs_attr3_leaf",
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
};
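
The xfs_log_check_lsn() call added here (and to the AGF, AGI, directory, superblock and symlink verifiers elsewhere in this diff) enforces one invariant: a v5 buffer's stamped LSN must not be ahead of the current log head, otherwise the metadata claims to come from log records that do not exist. A userspace sketch of that ordering test, with the kernel's cycle/block LSN packing reduced to the essentials:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* an LSN packs the log cycle into the high 32 bits and the block
 * number into the low 32 bits, so a plain integer compare orders them */
static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
{
	return ((uint64_t)cycle << 32) | block;
}

/* sketch: metadata stamped with an LSN beyond the head is suspect */
static bool check_lsn(uint64_t head_lsn, uint64_t metadata_lsn)
{
	return metadata_lsn <= head_lsn;
}

int main(void)
{
	uint64_t head = assign_lsn(12, 4096);

	printf("%d\n", check_lsn(head, assign_lsn(11, 9000))); /* 1: older cycle */
	printf("%d\n", check_lsn(head, assign_lsn(13, 0)));    /* 0: from the future */
	return 0;
}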
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index f38f9bd81557..a572532a55cd 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -107,7 +107,7 @@ xfs_attr3_rmt_verify(
if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
- be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
return false;
if (rmt->rm_owner == 0)
return false;
@@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify(
}
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .name = "xfs_attr3_rmt",
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
};
@@ -447,8 +448,6 @@ xfs_attr_rmtval_set(
* Roll through the "value", allocating blocks on disk as required.
*/
while (blkcnt > 0) {
- int committed;
-
/*
* Allocate a single extent, up to the size of the value.
*
@@ -466,24 +465,14 @@ xfs_attr_rmtval_set(
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
args->total, &map, &nmap, args->flist);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -614,31 +603,20 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
done = 0;
while (!done) {
- int committed;
-
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, args->firstblock,
args->flist, &done);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ args->dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, args->dp, 0);
-
- /*
* Close out trans and start the next one in the chain.
*/
error = xfs_trans_roll(&args->trans, args->dp);
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 0e8885a59646..0a94cce5ea35 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -32,13 +32,13 @@ int
xfs_bitmap_empty(uint *map, uint size)
{
uint i;
- uint ret = 0;
for (i = 0; i < size; i++) {
- ret |= map[i];
+ if (map[i] != 0)
+ return 0;
}
- return (ret == 0);
+ return 1;
}
/*
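
The xfs_bitmap_empty() rewrite above is behaviour-preserving but short-circuits on the first non-zero word instead of OR-ing the whole map together. A standalone check of the equivalence (both versions copied in spirit from the hunk):

#include <assert.h>

typedef unsigned int uint;

static int bitmap_empty_old(uint *map, uint size)
{
	uint i, ret = 0;

	for (i = 0; i < size; i++)
		ret |= map[i];
	return ret == 0;
}

static int bitmap_empty_new(uint *map, uint size)
{
	uint i;

	for (i = 0; i < size; i++) {
		if (map[i] != 0)
			return 0;	/* stop at the first set bit */
	}
	return 1;
}

int main(void)
{
	uint a[4] = { 0, 0, 0, 0 };
	uint b[4] = { 0, 8, 0, 0 };

	assert(bitmap_empty_old(a, 4) == bitmap_empty_new(a, 4));
	assert(bitmap_empty_old(b, 4) == bitmap_empty_new(b, 4));
	return 0;
}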
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 8e2010d53b07..ef00156f4f96 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -325,9 +325,11 @@ xfs_check_block(
/*
* Check that the extents for the inode ip are in the right order in all
- * btree leaves.
+ * btree leaves. This becomes prohibitively expensive for large extent count
+ * files, so don't bother with inodes that have more than 10,000 extents in
+ * them. The btree record ordering checks will still be done, and for such
+ * large bmapbt constructs they will catch most corruptions.
*/
-
STATIC void
xfs_bmap_check_leaf_extents(
xfs_btree_cur_t *cur, /* btree cursor or null */
@@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents(
return;
}
+ /* skip large extent count inodes */
+ if (ip->i_d.di_nextents > 10000)
+ return;
+
bno = NULLFSBLOCK;
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -948,14 +954,16 @@ xfs_bmap_local_to_extents(
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
/*
- * Initialise the block and copy the data
+ * Initialize the block, copy the data and log the remote buffer.
*
- * Note: init_fn must set the buffer log item type correctly!
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
*/
init_fn(tp, bp, ip, ifp);
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ /* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
@@ -1109,7 +1117,6 @@ xfs_bmap_add_attrfork(
xfs_trans_t *tp; /* transaction pointer */
int blks; /* space reservation */
int version = 1; /* superblock attr version */
- int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1212,7 +1219,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_bmap_finish(&tp, &flist, &committed);
+ error = xfs_bmap_finish(&tp, &flist, NULL);
if (error)
goto bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1435,7 +1442,7 @@ xfs_bmap_search_extents(
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- XFS_STATS_INC(xs_look_exlist);
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
ifp = XFS_IFORK_PTR(ip, fork);
ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
@@ -1721,10 +1728,11 @@ xfs_bmap_add_extent_delay_real(
xfs_filblks_t temp=0; /* value for da_new calculations */
xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
+ int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp;
- mp = bma->tp ? bma->tp->t_mountp : NULL;
- ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+ mp = bma->ip->i_mount;
+ ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1732,7 +1740,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -1783,7 +1791,7 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
@@ -2014,10 +2022,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
- &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+ &bma->cur, 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2098,10 +2106,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
- &tmp_rval, XFS_DATA_FORK);
+ &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2167,10 +2175,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
- 1, &tmp_rval, XFS_DATA_FORK);
+ 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2213,13 +2221,13 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
- da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+ da_old > 0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
goto done;
@@ -2240,7 +2248,7 @@ xfs_bmap_add_extent_delay_real(
if (bma->cur)
bma->cur->bc_private.b.allocated = 0;
- xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
bma->logflags |= rval;
return error;
@@ -2286,7 +2294,7 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
ASSERT(!isnullstartblock(new->br_startblock));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2937,7 +2945,7 @@ xfs_bmap_add_extent_hole_real(
int state; /* state bits, accessed thru macros */
struct xfs_mount *mp;
- mp = bma->tp ? bma->tp->t_mountp : NULL;
+ mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
@@ -2946,7 +2954,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -3800,8 +3808,13 @@ xfs_bmap_btalloc(
args.wasdel = ap->wasdel;
args.isfl = 0;
args.userdata = ap->userdata;
- if ((error = xfs_alloc_vextent(&args)))
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.ip = ap->ip;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
return error;
+
if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
* Exact allocation failed. Now try with alignment
@@ -4036,7 +4049,7 @@ xfs_bmapi_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapr);
+ XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4221,7 +4234,7 @@ xfs_bmapi_delay(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
@@ -4300,11 +4313,14 @@ xfs_bmapi_allocate(
/*
* Indicate if this is the first user data in the file, or just any
- * user data.
+ * user data. If it is user data, also indicate whether it needs to
+ * be initialised to zero during allocation.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ if (bma->flags & XFS_BMAPI_ZERO)
+ bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4419,6 +4435,17 @@ xfs_bmapi_convert_unwritten(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
@@ -4512,6 +4539,18 @@ xfs_bmapi_write(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /* zeroing is currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * We can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice while still leaving it an
+ * unwritten extent.
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
@@ -4525,7 +4564,7 @@ xfs_bmapi_write(
ifp = XFS_IFORK_PTR(ip, whichfork);
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
@@ -4718,12 +4757,12 @@ xfs_bmap_del_extent(
xfs_filblks_t temp2; /* for indirect length calculations */
int state = 0;
- XFS_STATS_INC(xs_del_exlist);
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
- mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
@@ -5070,7 +5109,7 @@ xfs_bunmapi(
*done = 1;
return 0;
}
- XFS_STATS_INC(xs_blk_unmap);
+ XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
@@ -5917,7 +5956,6 @@ xfs_bmap_split_extent(
struct xfs_trans *tp;
struct xfs_bmap_free free_list;
xfs_fsblock_t firstfsb;
- int committed;
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
@@ -5938,7 +5976,7 @@ xfs_bmap_split_extent(
if (error)
goto out;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 6aaa0c1c7200..423a34e832bd 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
xfs_extlen_t minleft; /* amount must be left after alloc */
bool eof; /* set if allocating past last extent */
bool wasdel; /* replacing a delayed allocation */
- bool userdata;/* set if is user data */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
+ char userdata;/* userdata mask */
int flags;
};
@@ -109,6 +109,14 @@ typedef struct xfs_bmap_free
*/
#define XFS_BMAPI_CONVERT 0x040
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * within the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO 0x080
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -116,7 +124,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }
static inline int xfs_bmapi_aflag(int w)
@@ -186,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
struct xfs_bmap_free *flist, struct xfs_mount *mp);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
- int *committed);
+ struct xfs_inode *ip);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
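
The two ASSERTs added to xfs_bmapi_write() earlier in this diff pin down which combinations the new flag allows. A small standalone sketch of that rule (the XFS_BMAPI_ZERO value is from this header; the METADATA and PREALLOC values and the helper itself are assumptions for illustration):

#include <assert.h>

#define XFS_BMAPI_METADATA	0x002	/* assumed value */
#define XFS_BMAPI_PREALLOC	0x008	/* assumed value */
#define XFS_BMAPI_ZERO		0x080

/* hypothetical helper mirroring the ASSERTs in xfs_bmapi_write() */
static int bmapi_zero_flags_valid(int flags)
{
	if (!(flags & XFS_BMAPI_ZERO))
		return 1;
	/* zeroing is only for data extents */
	if (flags & XFS_BMAPI_METADATA)
		return 0;
	/* pre-zeroing an unwritten extent is pointless */
	if (flags & XFS_BMAPI_PREALLOC)
		return 0;
	return 1;
}

int main(void)
{
	assert(bmapi_zero_flags_valid(XFS_BMAPI_ZERO));
	assert(!bmapi_zero_flags_valid(XFS_BMAPI_ZERO | XFS_BMAPI_PREALLOC));
	assert(!bmapi_zero_flags_valid(XFS_BMAPI_ZERO | XFS_BMAPI_METADATA));
	return 0;
}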
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6b0cf6546a82..1637c37bfbaa 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -720,6 +720,7 @@ xfs_bmbt_write_verify(
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .name = "xfs_bmbt",
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f7d7ee7a2607..a0eb18ce3ad3 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_alloc.h"
+#include "xfs_log.h"
/*
* Cursor allocation zone.
@@ -222,7 +223,7 @@ xfs_btree_check_ptr(
* long-form btree header.
*
* Prior to calculating the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -243,8 +244,14 @@ bool
xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
return true;
}
@@ -254,7 +261,7 @@ xfs_btree_lblock_verify_crc(
* short-form btree header.
*
* Prior to calculating the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -275,8 +282,14 @@ bool
xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
return true;
}
@@ -4067,3 +4080,61 @@ xfs_btree_change_owner(
return 0;
}
+
+/**
+ * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * btree block
+ *
+ * @bp: buffer containing the btree block
+ */
+bool
+xfs_btree_sblock_v5hdr_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ return true;
+}
+
+/**
+ * xfs_btree_sblock_verify() -- verify a short-format btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: maximum records allowed in this btree node
+ */
+bool
+xfs_btree_sblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return false;
+
+ /* sibling pointer verification */
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
+}
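
The sibling checks consolidated into xfs_btree_sblock_verify() encode a three-way rule: a sibling pointer of zero is always corrupt, and a non-zero pointer must either fall inside the AG or be the explicit NULLAGBLOCK sentinel. A standalone model of just that predicate:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define NULLAGBLOCK	((uint32_t)-1)

/* model of the sibling checks in xfs_btree_sblock_verify(): a sibling
 * pointer must be non-zero, and either inside the AG or explicitly NULL */
static bool sibling_valid(uint32_t sib, uint32_t agblocks)
{
	if (sib == 0)
		return false;
	return sib < agblocks || sib == NULLAGBLOCK;
}

int main(void)
{
	uint32_t agblocks = 1000;

	assert(sibling_valid(42, agblocks));		/* in-AG block */
	assert(sibling_valid(NULLAGBLOCK, agblocks));	/* no sibling */
	assert(!sibling_valid(0, agblocks));		/* never valid */
	assert(!sibling_valid(5000, agblocks));		/* beyond the AG */
	return 0;
}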
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 8f18bab73ea5..2e874be70209 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -84,31 +84,38 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(type, stat) \
- XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat) \
+#define __XFS_BTREE_STATS_INC(mp, type, stat) \
+ XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
- XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
+ XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ case XFS_BTNUM_BNO: \
+ __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: \
+ __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: \
+ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: \
+ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -465,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
#define XFS_BTREE_TRACE_ARGR(c, r)
#define XFS_BTREE_TRACE_CURSOR(c, t)
+bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
+bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+
#endif /* __XFS_BTREE_H__ */
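
The reworked stats macros thread a struct xfs_mount through so the counters can become per-filesystem instead of global; the token pasting that builds the counter name is unchanged. A toy model of the expansion (one fake counter, purely illustrative):

#include <stdio.h>

/* stand-in for the per-mount stats the real macros now update */
struct mount { unsigned long xs_abtb_2_lookup; };

#define STATS_INC(mp, stat)	((mp)->stat++)
#define BTREE_STATS_INC(mp, type, stat) \
	STATS_INC(mp, xs_ ## type ## _2_ ## stat)

int main(void)
{
	struct mount m = { 0 };

	BTREE_STATS_INC(&m, abtb, lookup);
	printf("abtb lookups: %lu\n", m.xs_abtb_2_lookup);
	return 0;
}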
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index be43248a5822..097bf7717d80 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -39,6 +39,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
* xfs_da_btree.c
@@ -150,6 +151,8 @@ xfs_da3_node_verify(
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_DA_NODE_MAGIC)
return false;
@@ -242,6 +245,7 @@ xfs_da3_node_read_verify(
}
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .name = "xfs_da3_node",
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
};
@@ -322,6 +326,7 @@ xfs_da3_node_create(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(bp->b_bn);
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 9de401d297e5..2fb53a5c0a74 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -271,7 +271,7 @@ xfs_dir_createname(
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
return rval;
- XFS_STATS_INC(xs_dir_create);
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -365,7 +365,7 @@ xfs_dir_lookup(
int lock_mode;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_lookup);
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
/*
* We need to use KM_NOFS here so that lockdep will not throw false
@@ -444,7 +444,7 @@ xfs_dir_removename(
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_remove);
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 4778d1dd511a..aa17cb788946 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function prototypes.
@@ -71,6 +72,8 @@ xfs_dir3_block_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
return false;
@@ -120,6 +123,7 @@ xfs_dir3_block_write_verify(
}
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .name = "xfs_dir3_block",
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 824131e71bc5..725fc7841fde 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -31,6 +31,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Check the consistency of the data block.
@@ -224,6 +225,8 @@ xfs_dir3_data_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
return false;
@@ -302,11 +305,13 @@ xfs_dir3_data_write_verify(
}
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .name = "xfs_dir3_data",
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
};
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .name = "xfs_dir3_data_reada",
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index f300240ebb8d..b887fb2a2bcf 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function declarations.
@@ -164,6 +165,8 @@ xfs_dir3_leaf_verify(
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+ return false;
} else {
if (leaf->hdr.info.magic != cpu_to_be16(magic))
return false;
@@ -242,11 +245,13 @@ xfs_dir3_leafn_write_verify(
}
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .name = "xfs_dir3_leaf1",
.verify_read = xfs_dir3_leaf1_read_verify,
.verify_write = xfs_dir3_leaf1_write_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .name = "xfs_dir3_leafn",
.verify_read = xfs_dir3_leafn_read_verify,
.verify_write = xfs_dir3_leafn_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index cc28e924545b..63ee03db796c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Function declarations.
@@ -97,6 +98,8 @@ xfs_dir3_free_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
return false;
@@ -147,6 +150,7 @@ xfs_dir3_free_write_verify(
}
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .name = "xfs_dir3_free",
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 5331b7f0460c..3cc3cf767474 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -54,7 +54,7 @@ xfs_dqcheck(
xfs_dqid_t id,
uint type, /* used only when IO_dorepair is true */
uint flags,
- char *str)
+ const char *str)
{
xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
int errs = 0;
@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
STATIC bool
xfs_dquot_buf_verify(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ int warn)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
xfs_dqid_t id = 0;
@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_buf_verify");
+ error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
if (error)
return false;
}
@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
if (!xfs_dquot_buf_verify_crc(mp, bp))
xfs_buf_ioerror(bp, -EFSBADCRC);
- else if (!xfs_dquot_buf_verify(mp, bp))
+ else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
}
/*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp) ||
+ !xfs_dquot_buf_verify(mp, bp, 0)) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ }
+}
+
+/*
* we don't calculate the CRC here as that is done when the dquot is flushed to
* the buffer after the update is done. This ensures that the dquot in the
* buffer always has an up-to-date CRC value.
@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if (!xfs_dquot_buf_verify(mp, bp)) {
+ if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
@@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify(
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .name = "xfs_dquot",
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+ .name = "xfs_dquot_ra",
+ .verify_read = xfs_dquot_buf_readahead_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
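
The new readahead ops table mirrors the inode-buffer pattern the comment above describes: a failed readahead verification must stay quiet (warn flag of 0), mark the buffer EIO and clear XBF_DONE so a real read retries with the full verifier. A standalone model of that control flow (stub buffer type; values illustrative):

#include <assert.h>
#include <stdbool.h>

#define XBF_DONE	(1 << 0)
#define EIO		5

struct buf { int flags; int error; };

/* stand-in for xfs_dquot_buf_verify(); 'warn' gates diagnostics */
static bool verify(struct buf *bp, int warn)
{
	(void)bp;
	(void)warn;		/* readahead passes 0: no log noise */
	return false;		/* pretend the dquots are corrupt */
}

/* model of xfs_dquot_buf_readahead_verify(): fail quietly with EIO and
 * clear DONE so a followup real read runs the full verifier */
static void readahead_verify(struct buf *bp)
{
	if (!verify(bp, 0)) {
		bp->error = -EIO;
		bp->flags &= ~XBF_DONE;
	}
}

int main(void)
{
	struct buf bp = { .flags = XBF_DONE, .error = 0 };

	readahead_verify(&bp);
	assert(bp.error == -EIO && !(bp.flags & XBF_DONE));
	return 0;
}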
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9590a069e556..e2536bb1c760 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -60,6 +60,14 @@ struct xfs_ifork;
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * of up to 2^16 bytes in size.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
* Supported feature bit list is just all bits in the versionnum field because
* we've used them all up and understand them all. Except, of course, for the
* shared superblock bit, which nobody knows what it does and so is unsupported.
@@ -778,7 +786,7 @@ typedef struct xfs_agfl {
__be64 agfl_lsn;
__be32 agfl_crc;
__be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+} __attribute__((packed)) xfs_agfl_t;
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
@@ -1483,13 +1491,17 @@ struct xfs_acl {
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
(xfs_sb_version_hascrc(&mp->m_sb) \
- ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
-#define XFS_ACL_MAX_SIZE(mp) \
+#define XFS_ACL_SIZE(cnt) \
(sizeof(struct xfs_acl) + \
- sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE "SGI_ACL_FILE"
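
With XFS_XATTR_SIZE_MAX and the new XFS_ACL_SIZE() macro in place, the v5 entry limit is just arithmetic. A standalone computation (struct layouts simplified to plausible 4- and 12-byte sizes; only the arithmetic mirrors the macros above):

#include <stdio.h>

#define XFS_XATTR_SIZE_MAX	(1 << 16)

/* simplified stand-ins, used only for the size arithmetic */
struct xfs_acl_entry { unsigned int ae_tag, ae_id, ae_perm; };
struct xfs_acl { unsigned int acl_cnt; struct xfs_acl_entry acl_entry[]; };

#define XFS_ACL_SIZE(cnt) \
	(sizeof(struct xfs_acl) + sizeof(struct xfs_acl_entry) * (cnt))

int main(void)
{
	/* v5 filesystems: as many entries as fit in one max-size xattr */
	unsigned int max = (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) /
			   sizeof(struct xfs_acl_entry);

	printf("max entries: %u, max size: %zu bytes\n",
	       max, XFS_ACL_SIZE(max));
	return 0;
}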
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 89689c6a43e2..b2b73a998d42 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -490,6 +490,16 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 54deb2d12ac6..66d702e6b9ff 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
/*
@@ -2500,9 +2501,14 @@ xfs_agi_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
return false;
+ }
+
/*
* Validate the magic number of the agi block.
*/
@@ -2566,6 +2572,7 @@ xfs_agi_write_verify(
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .name = "xfs_agi",
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index f39b285beb19..c679f3c05b63 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -221,7 +221,6 @@ xfs_inobt_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_perag *pag = bp->b_pag;
unsigned int level;
/*
@@ -237,14 +236,7 @@ xfs_inobt_verify(
switch (block->bb_magic) {
case cpu_to_be32(XFS_IBT_CRC_MAGIC):
case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_IBT_MAGIC):
@@ -254,24 +246,12 @@ xfs_inobt_verify(
return 0;
}
- /* numrecs and level verification */
+ /* level verification */
level = be16_to_cpu(block->bb_level);
if (level >= mp->m_in_maxlevels)
return false;
- if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
- return false;
-
- /* sibling pointer verification */
- if (!block->bb_u.s.bb_leftsib ||
- (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- if (!block->bb_u.s.bb_rightsib ||
- (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- return true;
+ return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
}
static void
@@ -304,6 +284,7 @@ xfs_inobt_write_verify(
}
const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .name = "xfs_inobt",
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 268c00f4f83a..1aabfda669b0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -62,11 +62,14 @@ xfs_inobp_check(
* has not had the inode cores stamped into it. Hence for readahead, the buffer
* may be potentially invalid.
*
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get an unnecessary
- * warnings during log recovery and we don't get unnecssary panics on debug
- * kernels.
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise to avoid warnings during log
+ * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behaviour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
*/
static void
xfs_inode_buf_verify(
@@ -93,6 +96,7 @@ xfs_inode_buf_verify(
XFS_RANDOM_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
return;
}
@@ -132,11 +136,13 @@ xfs_inode_buf_write_verify(
}
const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .name = "xfs_inode",
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .name = "xxfs_inode_ra",
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb379d..8e385f91d660 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -60,6 +60,7 @@ typedef struct xlog_recover {
*/
#define XLOG_BC_TABLE_SIZE 64
+#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1b0a08379759..f51078f1e92a 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
- xfs_dqid_t id, uint type, uint flags, char *str);
+ xfs_dqid_t id, uint type, uint flags, const char *str);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 47425140f343..8a53eaa349f4 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -163,6 +164,15 @@ xfs_mount_validate_sb(
"Filesystem can not be safely mounted by this kernel.");
return -EINVAL;
}
+ } else if (xfs_sb_version_hascrc(sbp)) {
+ /*
+ * We can't read verify the sb LSN because the read verifier is
+ * called before the log is allocated and processed. We know the
+ * log is set up before the write verifier (!check_version) calls,
+ * so just check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
}
if (xfs_sb_version_has_pquotino(sbp)) {
@@ -669,11 +679,13 @@ xfs_sb_write_verify(
}
const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .name = "xfs_sb",
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .name = "xfs_sb_quiet",
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 5be529707903..15c3ceb845b9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 8f8af05b3f13..2e2c6716b623 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -31,6 +31,7 @@
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
@@ -60,6 +61,7 @@ xfs_symlink_hdr_set(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return 0;
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
dsl->sl_offset = cpu_to_be32(offset);
dsl->sl_bytes = cpu_to_be32(size);
@@ -116,6 +118,8 @@ xfs_symlink_verify(
return false;
if (dsl->sl_owner == 0)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return false;
return true;
}
@@ -164,6 +168,7 @@ xfs_symlink_write_verify(
}
const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .name = "xfs_symlink",
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
};
@@ -183,6 +188,7 @@ xfs_symlink_local_to_remote(
if (!xfs_sb_version_hascrc(&mp->m_sb)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -198,4 +204,6 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4b641676f258..2d5df1f23bbc 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,16 +37,19 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
- struct xfs_acl *aclp,
- int max_entries)
+ const struct xfs_acl *aclp,
+ int len,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
- struct xfs_acl_entry *ace;
+ const struct xfs_acl_entry *ace;
unsigned int count, i;
+ if (len < sizeof(*aclp))
+ return ERR_PTR(-EFSCORRUPTED);
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -160,10 +163,11 @@ xfs_get_acl(struct inode *inode, int type)
*/
if (error == -ENOATTR)
goto out_update_cache;
+ acl = ERR_PTR(error);
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
@@ -248,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode)
return error;
}
-static int
-xfs_acl_exists(struct inode *inode, unsigned char *name)
-{
- int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
-
- return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
- ATTR_ROOT|ATTR_KERNOVAL) == 0);
-}
-
-int
-posix_acl_access_exists(struct inode *inode)
-{
- return xfs_acl_exists(inode, SGI_ACL_FILE);
-}
-
-int
-posix_acl_default_exists(struct inode *inode)
-{
- if (!S_ISDIR(inode->i_mode))
- return 0;
- return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
-}
-
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
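
The length check added to xfs_acl_from_disk() above closes a hole where a corrupt attribute could claim more entries than its buffer actually holds. A standalone model of the new sanity test (simplified layouts; like the real code, it also rejects a count above the maximum):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct acl_entry { unsigned int tag, id, perm; }; /* layout simplified */
struct acl_hdr { unsigned int cnt; };

#define ACL_SIZE(cnt) \
	(sizeof(struct acl_hdr) + sizeof(struct acl_entry) * (cnt))

/* model of the new xfs_acl_from_disk() checks: the buffer must hold at
 * least a header, and its length must match the claimed entry count */
static bool acl_len_valid(size_t len, unsigned int count, unsigned int max)
{
	if (len < sizeof(struct acl_hdr))
		return false;
	return count <= max && ACL_SIZE(count) == len;
}

int main(void)
{
	assert(acl_len_valid(ACL_SIZE(3), 3, 25));
	assert(!acl_len_valid(ACL_SIZE(3) - 1, 3, 25)); /* short buffer */
	assert(!acl_len_valid(ACL_SIZE(3), 2, 25));	/* count mismatch */
	return 0;
}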
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 3841b07f27bf..286fa89217f5 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -20,20 +20,18 @@
struct inode;
struct posix_acl;
-struct xfs_inode;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int posix_acl_access_exists(struct inode *inode);
-extern int posix_acl_default_exists(struct inode *inode);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
# define xfs_set_acl NULL
-# define posix_acl_access_exists(inode) 0
-# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
+
+extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
+
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 50ab2879b9da..379c089fb051 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -172,6 +172,12 @@ xfs_setfilesize_ioend(
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
+ /* we abort the update if there was an IO error */
+ if (ioend->io_error) {
+ xfs_trans_cancel(tp);
+ return ioend->io_error;
+ }
+
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
@@ -212,14 +218,17 @@ xfs_end_io(
ioend->io_error = -EIO;
goto done;
}
- if (ioend->io_error)
- goto done;
/*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extents after the data I/O has finished.
+ * Detecting and handling completion IO errors is done individually
+ * for each case as different cleanup operations need to be performed
+ * on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ if (ioend->io_error)
+ goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
@@ -1250,13 +1259,28 @@ xfs_vm_releasepage(
* the DIO. There is only going to be one reference to the ioend and its life
* cycle is constrained by the DIO completion code. hence we don't need
* reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
*/
+
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset)
+ xfs_off_t offset,
+ bool dax_fault)
{
struct xfs_ioend *ioend;
xfs_off_t size = bh_result->b_size;
@@ -1269,6 +1293,13 @@ xfs_map_direct(
trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+ if (dax_fault) {
+ ASSERT(type == XFS_IO_OVERWRITE);
+ trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+ imap);
+ return;
+ }
+
if (bh_result->b_private) {
ioend = bh_result->b_private;
ASSERT(ioend->io_size > 0);
@@ -1283,7 +1314,8 @@ xfs_map_direct(
ioend->io_size, ioend->io_type,
imap);
} else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode)) {
+ offset + size > i_size_read(inode) ||
+ offset + size < 0) {
ioend = xfs_alloc_ioend(inode, type);
ioend->io_offset = offset;
ioend->io_size = size;
@@ -1345,7 +1377,8 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- bool direct)
+ bool direct,
+ bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1393,18 +1426,20 @@ __xfs_get_blocks(
if (error)
goto out_unlock;
+ /* for DAX, we convert unwritten extents directly */
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK))) {
+ imap.br_startblock == DELAYSTARTBLOCK) ||
+ (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
if (direct || xfs_get_extsz_hint(ip)) {
/*
- * Drop the ilock in preparation for starting the block
- * allocation transaction. It will be retaken
- * exclusively inside xfs_iomap_write_direct for the
- * actual allocation.
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
*/
- xfs_iunlock(ip, lockmode);
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
@@ -1441,6 +1476,12 @@ __xfs_get_blocks(
goto out_unlock;
}
+ if (IS_DAX(inode) && create) {
+ ASSERT(!ISUNWRITTEN(&imap));
+ /* zeroing is not needed at a higher layer */
+ new = 0;
+ }
+
/* trim mapping down to size requested */
if (direct || size > (1 << inode->i_blkbits))
xfs_map_trim_size(inode, iblock, bh_result,
@@ -1458,7 +1499,8 @@ __xfs_get_blocks(
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset);
+ xfs_map_direct(inode, bh_result, &imap, offset,
+ dax_fault);
}
/*
@@ -1505,7 +1547,7 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}
int
@@ -1515,7 +1557,17 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
static void
@@ -1614,45 +1666,6 @@ xfs_end_io_direct_write(
__xfs_end_io_direct_write(inode, ioend, offset, size);
}
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
- struct buffer_head *bh,
- int uptodate)
-{
- struct xfs_ioend *ioend = bh->b_private;
- struct inode *inode = ioend->io_inode;
- ssize_t size = ioend->io_size;
-
- ASSERT(IS_DAX(ioend->io_inode));
-
- /* if there was an error zeroing, then don't convert it */
- if (!uptodate)
- ioend->io_error = -EIO;
-
- /*
- * Trim update to EOF, so we don't extend EOF during unwritten extent
- * conversion of partial EOF blocks.
- */
- spin_lock(&XFS_I(inode)->i_flags_lock);
- if (ioend->io_offset + size > i_size_read(inode))
- size = i_size_read(inode) - ioend->io_offset;
- spin_unlock(&XFS_I(inode)->i_flags_lock);
-
- __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
static inline ssize_t
xfs_vm_do_dio(
struct inode *inode,
@@ -1904,6 +1917,7 @@ xfs_vm_readpage(
struct file *unused,
struct page *page)
{
+ trace_xfs_vm_readpage(page->mapping->host, 1);
return mpage_readpage(page, xfs_get_blocks);
}
@@ -1914,6 +1928,7 @@ xfs_vm_readpages(
struct list_head *pages,
unsigned nr_pages)
{
+ trace_xfs_vm_readpages(mapping->host, nr_pages);
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1ac7895..f6ffc9ae5ceb 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -58,7 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
+int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 65fb37a18e92..0ef7c2ed3f8a 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -511,7 +511,7 @@ xfs_attr_list_int(
xfs_inode_t *dp = context->dp;
uint lock_mode;
- XFS_STATS_INC(xs_attr_list);
+ XFS_STATS_INC(dp->i_mount, xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
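
This is the first of many conversions in this series from global to per-mount statistics: every XFS_STATS_INC/ADD/DEC call gains a struct xfs_mount argument. A hedged userspace model of the macro shape (the real kernel counters are per-CPU; this flat version only illustrates the calling convention):

	#include <stdio.h>

	struct xfsstats { unsigned long xs_attr_list; };
	struct xfs_mount { struct xfsstats m_stats; };

	/* per-mount counter bump; the field name is resolved at compile time */
	#define XFS_STATS_INC(mp, field)  ((mp)->m_stats.field++)

	int main(void)
	{
		struct xfs_mount m = { { 0 } };
		XFS_STATS_INC(&m, xs_attr_list);
		printf("xs_attr_list=%lu\n", m.m_stats.xs_attr_list);
		return 0;
	}
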
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3bf4ad0d19e4..45ec9e40150c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -57,37 +57,66 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
+
+ if (IS_DAX(VFS_I(ip)))
+ return dax_clear_blocks(VFS_I(ip), block, size);
+
+ /*
+ * let the block layer decide on the fastest method of
+ * implementing the zeroing.
+ */
+ return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
+/*
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
* last due to locking considerations. We never free any extents in
* the first transaction.
*
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
+ * If an inode *ip is provided, rejoin it to the transaction if
+ * the transaction was committed.
*/
int /* error */
xfs_bmap_finish(
struct xfs_trans **tp, /* transaction pointer addr */
struct xfs_bmap_free *flist, /* i/o: list extents to free */
- int *committed)/* xact committed or not */
+ struct xfs_inode *ip)
{
struct xfs_efd_log_item *efd; /* extent free data */
struct xfs_efi_log_item *efi; /* extent free intention */
int error; /* error return value */
+ int committed;/* xact committed or not */
struct xfs_bmap_free_item *free; /* free extent item */
struct xfs_bmap_free_item *next; /* next item on free list */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0) {
- *committed = 0;
+ if (flist->xbf_count == 0)
return 0;
- }
+
efi = xfs_trans_get_efi(*tp, flist->xbf_count);
for (free = flist->xbf_first; free; free = free->xbfi_next)
xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- error = __xfs_trans_roll(tp, NULL, committed);
+ error = __xfs_trans_roll(tp, ip, &committed);
if (error) {
/*
* If the transaction was committed, drop the EFD reference
@@ -99,16 +128,13 @@ xfs_bmap_finish(
* transaction so we should return committed=1 even though we're
* returning an error.
*/
- if (*committed) {
+ if (committed) {
xfs_efi_release(efi);
xfs_force_shutdown((*tp)->t_mountp,
(error == -EFSCORRUPTED) ?
SHUTDOWN_CORRUPT_INCORE :
SHUTDOWN_META_IO_ERROR);
- } else {
- *committed = 1;
}
-
return error;
}
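
xfs_zero_extent() has to translate an XFS filesystem-block number into the 512-byte basic-block units the block layer and DAX helpers expect. A toy model of those conversions, assuming 4096-byte filesystem blocks (the shift values are illustrative, not read from a real superblock):

	#include <stdint.h>
	#include <stdio.h>

	#define BBSHIFT 9                    /* 512-byte basic blocks */
	static const unsigned blklog = 12;   /* assumed 4k fs blocks */

	static uint64_t fsb_to_bb(uint64_t fsb) { return fsb << (blklog - BBSHIFT); }
	static uint64_t bb_to_fsbt(uint64_t bb) { return bb >> (blklog - BBSHIFT); }

	int main(void)
	{
		uint64_t fsb = 100;
		uint64_t daddr = fsb_to_bb(fsb);   /* 800 basic blocks */
		printf("fsb %llu -> daddr %llu -> fsb %llu\n",
		       (unsigned long long)fsb, (unsigned long long)daddr,
		       (unsigned long long)bb_to_fsbt(daddr));
		return 0;
	}
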
@@ -229,6 +255,13 @@ xfs_bmap_rtalloc(
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+ /* Zero the extent if we were asked to do so */
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+ if (error)
+ return error;
+ }
} else {
ap->length = 0;
}
@@ -933,7 +966,6 @@ xfs_alloc_file_space(
xfs_bmbt_irec_t imaps[1], *imapp;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
- int committed;
int error;
trace_xfs_alloc_file_space(ip);
@@ -1027,24 +1059,21 @@ xfs_alloc_file_space(
xfs_bmap_init(&free_list, &firstfsb);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, &firstfsb,
- 0, imapp, &nimaps, &free_list);
- if (error) {
+ resblks, imapp, &nimaps, &free_list);
+ if (error)
goto error0;
- }
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
goto error0;
- }
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error) {
+ if (error)
break;
- }
allocated_fsb = imapp->br_blockcount;
@@ -1170,7 +1199,6 @@ xfs_free_file_space(
xfs_off_t offset,
xfs_off_t len)
{
- int committed;
int done;
xfs_fileoff_t endoffset_fsb;
int error;
@@ -1310,17 +1338,15 @@ xfs_free_file_space(
error = xfs_bunmapi(tp, ip, startoffset_fsb,
endoffset_fsb - startoffset_fsb,
0, 2, &firstfsb, &free_list, &done);
- if (error) {
+ if (error)
goto error0;
- }
/*
* complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
goto error0;
- }
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1398,7 +1424,6 @@ xfs_shift_file_space(
int error;
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
- int committed;
xfs_fileoff_t stop_fsb;
xfs_fileoff_t next_fsb;
xfs_fileoff_t shift_fsb;
@@ -1490,7 +1515,7 @@ xfs_shift_file_space(
if (error)
goto out_bmap_cancel;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ecffb35935b..daed4bfb85b2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -201,7 +201,7 @@ _xfs_buf_alloc(
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(xb_create);
+ XFS_STATS_INC(target->bt_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -354,15 +354,16 @@ retry:
*/
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(xb_page_retries);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(xb_page_found);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -516,7 +517,7 @@ _xfs_buf_find(
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
} else {
- XFS_STATS_INC(xb_miss_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
}
@@ -529,11 +530,11 @@ found:
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
- XFS_STATS_INC(xb_busy_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return NULL;
}
xfs_buf_lock(bp);
- XFS_STATS_INC(xb_get_locked_waited);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
/*
@@ -549,7 +550,7 @@ found:
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(xb_get_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
return bp;
}
@@ -603,7 +604,14 @@ found:
}
}
- XFS_STATS_INC(xb_get);
+ /*
+ * Clear b_error if this is a lookup from a caller that doesn't expect
+ * valid data to be found in the buffer.
+ */
+ if (!(flags & XBF_READ))
+ xfs_buf_ioerror(bp, 0);
+
+ XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
}
@@ -643,7 +651,7 @@ xfs_buf_read_map(
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!XFS_BUF_ISDONE(bp)) {
- XFS_STATS_INC(xb_get_read);
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
@@ -1044,7 +1052,7 @@ xfs_buf_ioend_work(
xfs_buf_ioend(bp);
}
-void
+static void
xfs_buf_ioend_async(
struct xfs_buf *bp)
{
@@ -1631,13 +1639,9 @@ xfs_setsize_buftarg(
btp->bt_meta_sectormask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
- char name[BDEVNAME_SIZE];
-
- bdevname(btp->bt_bdev, name);
-
xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %s",
- sectorsize, name);
+ "Cannot set_blocksize to %u on device %pg",
+ sectorsize, btp->bt_bdev);
return -EINVAL;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c79b717d9b88..c75721acd867 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -132,6 +132,7 @@ struct xfs_buf_map {
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
struct xfs_buf_ops {
+ char *name;
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
};
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a989a9c7edb7..642d55d10075 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -666,7 +666,7 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_getdents);
+ XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 30cb3afb67f0..9c44d38dcd1f 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -75,9 +75,9 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_qm_dqzone, dqp);
- XFS_STATS_DEC(xs_qm_dquot);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
}
/*
@@ -306,7 +306,7 @@ xfs_qm_dqalloc(
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
xfs_bmbt_irec_t map;
- int nmaps, error, committed;
+ int nmaps, error;
xfs_buf_t *bp;
xfs_trans_t *tp = *tpp;
@@ -379,11 +379,12 @@ xfs_qm_dqalloc(
xfs_trans_bhold(tp, bp);
- if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+ error = xfs_bmap_finish(tpp, &flist, NULL);
+ if (error)
goto error1;
- }
- if (committed) {
+ /* Transaction was committed? */
+ if (*tpp != tp) {
tp = *tpp;
xfs_trans_bjoin(tp, bp);
} else {
@@ -393,9 +394,9 @@ xfs_qm_dqalloc(
*O_bpp = bp;
return 0;
- error1:
+error1:
xfs_bmap_cancel(&flist);
- error0:
+error0:
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return error;
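
With the `committed` out-parameter removed from xfs_bmap_finish(), callers such as xfs_qm_dqalloc() that held a buffer across the roll now infer a commit by comparing transaction pointers. A tiny model of that idiom (the types and roll helper are stand-ins):

	#include <stdbool.h>
	#include <stdio.h>

	struct trans { int id; };

	/* roll: on commit the caller's pointer is swapped for a fresh transaction */
	static void trans_roll(struct trans **tp, struct trans *fresh, bool commit)
	{
		if (commit)
			*tp = fresh;
	}

	int main(void)
	{
		struct trans a = { 1 }, b = { 2 };
		struct trans *tp = &a, *old = tp;
		trans_roll(&tp, &b, true);
		/* the old "committed" flag is now just a pointer compare */
		printf("committed: %d\n", tp != old);
		return 0;
	}
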
@@ -605,7 +606,7 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(xs_qm_dquot);
+ XFS_STATS_INC(mp, xs_qm_dquot);
trace_xfs_dqread(dqp);
@@ -747,12 +748,12 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(xs_qm_dqcachehits);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
*O_dqpp = dqp;
return 0;
}
mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(xs_qm_dqcachemisses);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -806,7 +807,7 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(xs_qm_dquot_dups);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
@@ -846,7 +847,7 @@ xfs_qm_dqput(
trace_xfs_dqput_free(dqp);
if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 74d0e5966ebc..88693a98fac5 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -164,9 +164,9 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- __return_address, bp->b_bn);
+ __return_address, bp->b_ops->name, bp->b_bn);
xfs_alert(mp, "Unmount and run xfs_repair");
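
Adding a name to struct xfs_buf_ops lets the verifier error above report which metadata structure failed, not just where. A sketch of how a tagged ops table might look (the verifier body and the name string are illustrative):

	#include <stdio.h>

	struct buf_ops {
		const char *name;
		void (*verify_read)(void);
	};

	static void inode_verify(void) { /* CRC + structure checks */ }

	static const struct buf_ops inode_buf_ops = {
		.name        = "xfs_inode",
		.verify_read = inode_verify,
	};

	int main(void)
	{
		printf("Metadata corruption detected, %s block 0x%llx\n",
		       inode_buf_ops.name, 0x1000ULL);
		return 0;
	}
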
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb400e22..ebe9b8290a70 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -242,19 +242,30 @@ xfs_file_fsync(
}
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to
+ * flush the log up to the latest LSN that touched the inode. If we have
+ * concurrent fsync/fdatasync() calls, we need them to all block on the
+ * log force before we clear the ili_fsync_fields field. This ensures
+ * that we don't get a racing sync operation that does not wait for the
+ * metadata to hit the journal before returning. If we race with
+ * clearing the ili_fsync_fields, then all that will happen is the log
+ * force will do nothing as the lsn will already be on disk. We can't
+ * race with setting ili_fsync_fields because that is done under
+ * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+ * until after the ili_fsync_fields is cleared.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (lsn)
+ if (lsn) {
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ ip->i_itemp->ili_fsync_fields = 0;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
/*
* If we only have a single device, and the log force above was
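
The ordering rule in the comment above is subtle: ili_fsync_fields may only be cleared after the log force completes and while the inode lock is still held shared, so a concurrent fsync either sees the fields set and blocks on the force, or sees them clear because that LSN is already on disk. A single-threaded userspace model of the control flow (the locking and LSN plumbing are stand-ins; build with -lpthread):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;
	static unsigned fsync_fields = 0x4;      /* pretend metadata was logged */
	static unsigned long long last_lsn = 42, disk_lsn;

	static void log_force(unsigned long long lsn)
	{
		if (lsn > disk_lsn)
			disk_lsn = lsn;          /* stand-in for the log flush */
	}

	static void fsync_path(void)
	{
		pthread_rwlock_rdlock(&ilock);
		unsigned long long lsn = fsync_fields ? last_lsn : 0;
		if (lsn) {
			log_force(lsn);
			fsync_fields = 0;        /* only after the force completed */
		}
		pthread_rwlock_unlock(&ilock);   /* lock held until fields cleared */
	}

	int main(void)
	{
		fsync_path();
		printf("disk_lsn=%llu fields=%u\n", disk_lsn, fsync_fields);
		return 0;
	}
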
@@ -287,7 +298,7 @@ xfs_file_read_iter(
xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(mp, xs_read_calls);
if (unlikely(iocb->ki_flags & IOCB_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
@@ -365,7 +376,7 @@ xfs_file_read_iter(
ret = generic_file_read_iter(iocb, to);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -383,7 +394,7 @@ xfs_file_splice_read(
int ioflags = 0;
ssize_t ret;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(ip->i_mount, xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
@@ -391,19 +402,26 @@ xfs_file_splice_read(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- /* for dax, we need to avoid the page cache */
- if (IS_DAX(VFS_I(ip)))
- ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
- else
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ /*
+ * DAX inodes cannot use the page cache for splice, so we have to push
+ * them through the VFS IO path. This means it goes through
+ * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
+ * cannot lock the splice operation at this level for DAX inodes.
+ */
+ if (IS_DAX(VFS_I(ip))) {
+ ret = default_file_splice_read(infilp, ppos, pipe, count,
+ flags);
+ goto out;
+ }
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+out:
+ if (ret > 0)
+ XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
return ret;
}
@@ -482,6 +500,8 @@ xfs_zero_eof(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
+ trace_xfs_zero_eof(ip, isize, offset - isize);
+
/*
* First handle zeroing the block on which isize resides.
*
@@ -574,6 +594,7 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
ssize_t error = 0;
size_t count = iov_iter_count(from);
+ bool drained_dio = false;
restart:
error = generic_write_checks(iocb, from);
@@ -611,12 +632,13 @@ restart:
bool zero = false;
spin_unlock(&ip->i_flags_lock);
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
-
+ if (!drained_dio) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
@@ -626,6 +648,7 @@ restart:
* no-op.
*/
inode_dio_wait(inode);
+ drained_dio = true;
goto restart;
}
error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
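
The drained_dio flag bounds the restart loop: without it, AIO completions extending EOF during inode_dio_wait() could make a writer cycle the iolock forever. A minimal sketch of the pattern (the EOF predicate is faked):

	#include <stdbool.h>
	#include <stdio.h>

	static bool beyond_eof(int pass) { return pass == 0; }   /* stand-in */

	static int write_checks(void)
	{
		bool drained_dio = false;
		int pass = 0;
	restart:
		if (beyond_eof(pass)) {
			if (!drained_dio) {
				/* upgrade the lock, inode_dio_wait() here */
				drained_dio = true;
				pass++;
				goto restart;    /* re-check EOF exactly once */
			}
			/* now safe to zero from old EOF to the write offset */
		}
		return 0;
	}

	int main(void) { return write_checks(); }
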
@@ -867,7 +890,7 @@ xfs_file_write_iter(
ssize_t ret;
size_t ocount = iov_iter_count(from);
- XFS_STATS_INC(xs_write_calls);
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
if (ocount == 0)
return 0;
@@ -883,7 +906,7 @@ xfs_file_write_iter(
if (ret > 0) {
ssize_t err;
- XFS_STATS_ADD(xs_write_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1477,7 +1500,7 @@ xfs_file_llseek(
*
* mmap_sem (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmap_lock (XFS - truncate serialisation)
+ * i_mmaplock (XFS - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1503,10 +1526,9 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else {
- ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
}
@@ -1538,7 +1560,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1546,6 +1568,13 @@ xfs_filemap_fault(
return ret;
}
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. There is no ->pmd_mkwrite callout for huge
+ * pages, so we have a single function here to handle both cases. @flags
+ * carries the information on the type of fault occurring.
+ */
STATIC int
xfs_filemap_pmd_fault(
struct vm_area_struct *vma,
@@ -1562,22 +1591,62 @@ xfs_filemap_pmd_fault(
trace_xfs_filemap_pmd_fault(ip);
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+ NULL);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
+
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
return ret;
}
+/*
+ * pfn_mkwrite was originally intended to ensure we capture timestamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate, similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+ * barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ trace_xfs_filemap_pfn_mkwrite(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+
+ /* check if the faulting page hasn't raced with truncate */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
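
The truncate-race check in xfs_filemap_pfn_mkwrite() reduces to comparing the faulting page index with the page-rounded EOF sampled under the mmap lock. A standalone sketch of just that arithmetic:

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SIZE  4096ULL
	#define PAGE_SHIFT 12

	/* SIGBUS if the fault lands at or past the last page covering EOF */
	static bool fault_past_eof(unsigned long long pgoff,
				   unsigned long long isize)
	{
		unsigned long long pages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
		return pgoff >= pages;
	}

	int main(void)
	{
		printf("%d\n", fault_past_eof(1, 4096)); /* 1 -> VM_FAULT_SIGBUS */
		printf("%d\n", fault_past_eof(0, 1));    /* 0 -> VM_FAULT_NOPAGE */
		return 0;
	}
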
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
STATIC int
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0a326bd64d4e..d7a490f24ead 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,7 +63,7 @@ xfs_inode_alloc(
return NULL;
}
- XFS_STATS_INC(vn_active);
+ XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
@@ -129,7 +129,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(vn_active);
+ XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
@@ -159,7 +159,7 @@ xfs_iget_cache_hit(
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -177,7 +177,7 @@ xfs_iget_cache_hit(
*/
if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -259,7 +259,7 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
+ XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -342,7 +342,7 @@ xfs_iget_cache_miss(
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
+ XFS_STATS_INC(mp, xs_ig_dup);
error = -EAGAIN;
goto out_preload_end;
}
@@ -412,7 +412,7 @@ xfs_iget(
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
- XFS_STATS_INC(xs_ig_attempts);
+ XFS_STATS_INC(mp, xs_ig_attempts);
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -429,7 +429,7 @@ again:
goto out_error_or_again;
} else {
rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
+ XFS_STATS_INC(mp, xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
@@ -965,7 +965,7 @@ reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- XFS_STATS_INC(xs_ig_reclaims);
+ XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
/*
* Remove the inode from the per-AG radix tree.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dc40a6d5ae0d..ae3758a90ed6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1143,7 +1143,6 @@ xfs_create(
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- int committed;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1226,7 +1225,7 @@ xfs_create(
* pointing to itself.
*/
error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
- prid, resblks > 0, &ip, &committed);
+ prid, resblks > 0, &ip, NULL);
if (error)
goto out_trans_cancel;
@@ -1275,7 +1274,7 @@ xfs_create(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -1427,7 +1426,6 @@ xfs_link(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
int resblks;
trace_xfs_link(tdp, target_name);
@@ -1502,11 +1500,10 @@ xfs_link(
* link transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- }
- error = xfs_bmap_finish (&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error) {
xfs_bmap_cancel(&free_list);
goto error_return;
@@ -1555,7 +1552,6 @@ xfs_itruncate_extents(
xfs_fileoff_t first_unmap_block;
xfs_fileoff_t last_block;
xfs_filblks_t unmap_len;
- int committed;
int error = 0;
int done = 0;
@@ -1601,9 +1597,7 @@ xfs_itruncate_extents(
* Duplicate the transaction that has the permanent
* reservation and commit the old transaction.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (committed)
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_bmap_finish(&tp, &free_list, ip);
if (error)
goto out_bmap_cancel;
@@ -1774,7 +1768,6 @@ xfs_inactive_ifree(
{
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error;
@@ -1841,7 +1834,7 @@ xfs_inactive_ifree(
* Just ignore errors at this point. There is nothing we can do except
* to try to keep going. Make sure it's not a silent error.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error) {
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
@@ -2365,6 +2358,7 @@ retry:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -2522,7 +2516,6 @@ xfs_remove(
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
uint resblks;
trace_xfs_remove(dp, name);
@@ -2623,7 +2616,7 @@ xfs_remove(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -2700,7 +2693,6 @@ xfs_finish_rename(
struct xfs_trans *tp,
struct xfs_bmap_free *free_list)
{
- int committed = 0;
int error;
/*
@@ -2710,7 +2702,7 @@ xfs_finish_rename(
if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, free_list, &committed);
+ error = xfs_bmap_finish(&tp, free_list, NULL);
if (error) {
xfs_bmap_cancel(free_list);
xfs_trans_cancel(tp);
@@ -3271,8 +3263,8 @@ xfs_iflush_cluster(
}
if (clcount) {
- XFS_STATS_INC(xs_icluster_flushcnt);
- XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
}
out_free:
@@ -3345,7 +3337,7 @@ xfs_iflush(
struct xfs_dinode *dip;
int error;
- XFS_STATS_INC(xs_iflush_count);
+ XFS_STATS_INC(mp, xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
@@ -3560,6 +3552,7 @@ xfs_iflush_int(
*/
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 62bd80f4edd9..d14b12b8cfef 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -719,6 +719,7 @@ xfs_iflush_abort(
* attempted.
*/
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 488d81254e28..4c7722e325b3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
} xfs_inode_log_item_t;
static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85af5310..d42738deec6d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_pnfs.h"
+#include "xfs_acl.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -411,7 +412,7 @@ xfs_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
@@ -455,7 +456,7 @@ xfs_attrmulti_attr_get(
unsigned char *kbuf;
int error = -EFAULT;
- if (*len > XATTR_SIZE_MAX)
+ if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = kmem_zalloc_large(*len, KM_SLEEP);
if (!kbuf)
@@ -482,17 +483,22 @@ xfs_attrmulti_attr_set(
__uint32_t flags)
{
unsigned char *kbuf;
+ int error;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XATTR_SIZE_MAX)
+ if (len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = memdup_user(ubuf, len);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ kfree(kbuf);
+ return error;
}
int
@@ -501,9 +507,14 @@ xfs_attrmulti_attr_remove(
unsigned char *name,
__uint32_t flags)
{
+ int error;
+
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- return xfs_attr_remove(XFS_I(inode), name, flags);
+ error = xfs_attr_remove(XFS_I(inode), name, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ return error;
}
STATIC int
@@ -1028,7 +1039,7 @@ xfs_ioctl_setattr_xflags(
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
return 0;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index b88bdc85dd3d..1a05d8ae327d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle(
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033171c8..d81bdc080370 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -129,22 +129,31 @@ xfs_iomap_write_direct(
xfs_trans_t *tp;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
- int committed;
int error;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
+ int lockmode;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
+ lockmode = XFS_ILOCK_SHARED; /* locked by caller */
+
+ ASSERT(xfs_isilocked(ip, lockmode));
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
if ((offset + count) > XFS_ISIZE(ip)) {
+ /*
+ * Assert that the in-core extent list is present since this can
+ * call xfs_iread_extents() and we only have the ilock shared.
+ * This should be safe because the lock was held around a bmapi
+ * call in the caller and we only need it to access the in-core
+ * list.
+ */
+ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
+ XFS_IFEXTENTS);
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- return error;
+ goto out_unlock;
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -174,9 +183,40 @@ xfs_iomap_write_direct(
}
/*
+ * Drop the shared lock acquired by the caller, attach the dquot if
+ * necessary and move on to transaction setup.
+ */
+ xfs_iunlock(ip, lockmode);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
* Allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+ /*
+ * For DAX, we do not allocate unwritten extents, but instead we zero
+ * the block before we commit the transaction. Ideally we'd like to do
+ * this outside the transaction context, but if we commit and then crash
+ * we may not have zeroed the blocks and this will be exposed on
+ * recovery of the allocation. Hence we must zero before commit.
+ *
+ * Further, if we are mapping unwritten extents here, we need to zero
+ * and convert them to written so that we don't need an unwritten extent
+ * callback for DAX. This also means that we need to be able to dip into
+ * the reserve block pool for bmbt block allocation if there is no space
+ * left but we need to do unwritten extent conversion.
+ */
+
+ if (IS_DAX(VFS_I(ip))) {
+ bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+ if (ISUNWRITTEN(imap)) {
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ }
+ }
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, resrtextents);
/*
@@ -187,7 +227,8 @@ xfs_iomap_write_direct(
return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
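
The DAX branch a few hunks above swaps the usual unwritten-extent preallocation for zero-and-convert at allocation time, so no completion callback is needed later. A hedged model of the flag selection (flag names and values are stand-ins for the real XFS_BMAPI_* constants):

	#include <stdio.h>

	#define BMAPI_PREALLOC 0x1
	#define BMAPI_CONVERT  0x2
	#define BMAPI_ZERO     0x4

	static int alloc_flags(int is_dax, int unwritten, int *use_reserve)
	{
		*use_reserve = 0;
		if (!is_dax)
			return BMAPI_PREALLOC;  /* unwritten extents, lazy zeroing */
		if (unwritten)
			*use_reserve = 1;       /* conversion may dip into reserve pool */
		return BMAPI_CONVERT | BMAPI_ZERO;
	}

	int main(void)
	{
		int resv;
		printf("dax flags=%#x reserve=%d\n", alloc_flags(1, 1, &resv), resv);
		return 0;
	}
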
@@ -202,17 +243,18 @@ xfs_iomap_write_direct(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_PREALLOC, &firstfsb, 0,
- imap, &nimaps, &free_list);
+ bmapi_flags, &firstfsb, resblks, imap,
+ &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
+
error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -229,7 +271,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
out_bmap_cancel:
@@ -655,7 +697,7 @@ xfs_iomap_write_allocate(
xfs_bmap_free_t free_list;
xfs_filblks_t count_fsb;
xfs_trans_t *tp;
- int nimaps, committed;
+ int nimaps;
int error = 0;
int nres;
@@ -670,7 +712,7 @@ xfs_iomap_write_allocate(
count_fsb = imap->br_blockcount;
map_start_fsb = imap->br_startoff;
- XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
while (count_fsb != 0) {
/*
@@ -750,13 +792,13 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0,
- &first_block, 1,
- imap, &nimaps, &free_list);
+ count_fsb, 0, &first_block,
+ nres, imap, &nimaps,
+ &free_list);
if (error)
goto trans_cancel;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto trans_cancel;
@@ -777,7 +819,7 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff +
imap->br_blockcount))) {
- XFS_STATS_INC(xs_xstrat_quick);
+ XFS_STATS_INC(mp, xs_xstrat_quick);
return 0;
}
@@ -814,7 +856,6 @@ xfs_iomap_write_unwritten(
xfs_bmap_free_t free_list;
xfs_fsize_t i_size;
uint resblks;
- int committed;
int error;
trace_xfs_unwritten_convert(ip, offset, count);
@@ -866,8 +907,8 @@ xfs_iomap_write_unwritten(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, &firstfsb,
- 1, &imap, &nimaps, &free_list);
+ XFS_BMAPI_CONVERT, &firstfsb, resblks,
+ &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
@@ -886,7 +927,7 @@ xfs_iomap_write_unwritten(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8294132e6a3c..06eafafe636e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -414,13 +414,17 @@ xfs_vn_rename(
* uio is kmalloced for this reason...
*/
STATIC const char *
-xfs_vn_follow_link(
+xfs_vn_get_link(
struct dentry *dentry,
- void **cookie)
+ struct inode *inode,
+ struct delayed_call *done)
{
char *link;
int error = -ENOMEM;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
if (!link)
goto out_err;
@@ -429,7 +433,8 @@ xfs_vn_follow_link(
if (unlikely(error))
goto out_kfree;
- return *cookie = link;
+ set_delayed_call(done, kfree_link, link);
+ return link;
out_kfree:
kfree(link);
@@ -695,7 +700,7 @@ xfs_setattr_nonsize(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -922,7 +927,7 @@ xfs_setattr_size(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -1172,8 +1177,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = xfs_vn_follow_link,
- .put_link = kfree_put_link,
+ .get_link = xfs_vn_get_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 85f883dd6207..ec0e239a0fa9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -171,6 +171,13 @@ struct xfs_kobj {
struct completion complete;
};
+struct xstats {
+ struct xfsstats __percpu *xs_stats;
+ struct xfs_kobj xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
/* Kernel uid/gid conversion. These are used to convert to/from the on disk
* uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
* The conversion here is type only, the value will remain the same since we
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index aaadee0969c9..9c9a1c9bcc7f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -268,7 +268,7 @@ xlog_grant_head_wait(
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&head->lock);
- XFS_STATS_INC(xs_sleep_logspace);
+ XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
trace_xfs_log_grant_sleep(log, tic);
schedule();
@@ -379,7 +379,7 @@ xfs_log_regrant(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
/*
* This is a new transaction on the ticket, so we need to change the
@@ -448,7 +448,7 @@ xfs_log_reserve(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
@@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp)
int aborted = 0;
/*
- * Race to shutdown the filesystem if we see an error.
+ * Race to shutdown the filesystem if we see an error or the iclog is in
+ * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
+ * CRC errors into log recovery.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
- XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR,
+ XFS_RANDOM_IODONE_IOERR) ||
+ iclog->ic_state & XLOG_STATE_IOABORT) {
+ if (iclog->ic_state & XLOG_STATE_IOABORT)
+ iclog->ic_state &= ~XLOG_STATE_IOABORT;
+
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1768,7 +1774,7 @@ xlog_sync(
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
int size;
- XFS_STATS_INC(xs_log_writes);
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/* Add for LR header */
@@ -1805,7 +1811,7 @@ xlog_sync(
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
- XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+ XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -1838,6 +1844,23 @@ xlog_sync(
/* calculate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
iclog->ic_datap, size);
+#ifdef DEBUG
+ /*
+ * Intentionally corrupt the log record CRC based on the error injection
+ * frequency, if defined. This facilitates testing log recovery in the
+ * event of torn writes. Hence, set the IOABORT state to abort the log
+ * write on I/O completion and shut down the fs. The subsequent mount
+ * detects the bad CRC and attempts to recover.
+ */
+ if (log->l_badcrc_factor &&
+ (prandom_u32() % log->l_badcrc_factor == 0)) {
+ iclog->ic_header.h_crc &= 0xAAAAAAAA;
+ iclog->ic_state |= XLOG_STATE_IOABORT;
+ xfs_warn(log->l_mp,
+ "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
+ be64_to_cpu(iclog->ic_header.h_lsn));
+ }
+#endif
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
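
The DEBUG-only hunk above corrupts roughly one record CRC in every l_badcrc_factor writes so recovery's torn-write handling can be exercised. A userspace model of the knob (prandom_u32() swapped for rand(); the mask matches the patch):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	static uint32_t maybe_corrupt_crc(uint32_t crc, unsigned factor)
	{
		if (factor && (rand() % factor == 0))
			return crc & 0xAAAAAAAA;   /* strip alternating bits */
		return crc;
	}

	int main(void)
	{
		srand(1);
		unsigned hits = 0;
		for (int i = 0; i < 1000; i++)
			if (maybe_corrupt_crc(0xFFFFFFFFu, 4) != 0xFFFFFFFFu)
				hits++;
		printf("corrupted %u/1000 records\n", hits);
		return 0;
	}
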
@@ -2045,12 +2068,14 @@ xlog_print_tic_res(
"QM_DQCLUSTER",
"QM_QINOCREATE",
"QM_QUOTAOFF_END",
- "SB_UNIT",
"FSYNC_TS",
"GROWFSRT_ALLOC",
"GROWFSRT_ZERO",
"GROWFSRT_FREE",
- "SWAPEXT"
+ "SWAPEXT",
+ "CHECKPOINT",
+ "ICREATE",
+ "CREATE_TMPFILE"
};
xfs_warn(mp, "xlog_write: reservation summary:");
@@ -2422,11 +2447,20 @@ xlog_write(
&partial_copy_len);
xlog_verify_dest_ptr(log, ptr);
- /* copy region */
+ /*
+ * Copy region.
+ *
+ * Unmount records just log an opheader, so they can have
+ * empty payloads with no data region to copy. Hence we
+ * only copy the payload if the vector says it has data
+ * to copy.
+ */
ASSERT(copy_len >= 0);
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
-
+ if (copy_len > 0) {
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ copy_len);
+ }
copy_len += start_rec_copy + sizeof(xlog_op_header_t);
record_cnt++;
data_cnt += contwr ? copy_len : 0;
@@ -2782,11 +2816,19 @@ xlog_state_do_callback(
}
} while (!ioerrors && loopdidcallbacks);
+#ifdef DEBUG
/*
- * make one last gasp attempt to see if iclogs are being left in
- * limbo..
+ * Make one last gasp attempt to see if iclogs are being left in limbo.
+ * If the above loop finds an iclog earlier than the current iclog and
+ * in one of the syncing states, the current iclog is put into
+ * DO_CALLBACK and the callbacks are deferred to the completion of the
+ * earlier iclog. Walk the iclogs in order and make sure that no iclog
+ * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
+ * states.
+ *
+ * Note that SYNCING|IOABORT is a valid state so we cannot just check
+ * for ic_state == SYNCING.
*/
-#ifdef DEBUG
if (funcdidcallbacks) {
first_iclog = iclog = log->l_iclog;
do {
@@ -2801,7 +2843,7 @@ xlog_state_do_callback(
* IOERROR - give up hope all ye who enter here
*/
if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_SYNCING ||
+ iclog->ic_state & XLOG_STATE_SYNCING ||
iclog->ic_state == XLOG_STATE_DONE_SYNC ||
iclog->ic_state == XLOG_STATE_IOERROR )
break;
@@ -2913,7 +2955,7 @@ restart:
iclog = log->l_iclog;
if (iclog->ic_state != XLOG_STATE_ACTIVE) {
- XFS_STATS_INC(xs_log_noiclogs);
+ XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
/* Wait for log writes to have flushed */
xlog_wait(&log->l_flush_wait, &log->l_icloglock);
@@ -3165,11 +3207,19 @@ xlog_state_switch_iclogs(
}
if (log->l_curr_block >= log->l_logBBsize) {
+ /*
+ * Rewind the current block before the cycle is bumped to make
+ * sure that the combined LSN never transiently moves forward
+ * when the log wraps to the next cycle. This is to support the
+ * unlocked sample of these fields from xlog_valid_lsn(). Most
+ * other cases should acquire l_icloglock.
+ */
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
- log->l_curr_block -= log->l_logBBsize;
- ASSERT(log->l_curr_block >= 0);
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
@@ -3212,7 +3262,7 @@ _xfs_log_force(
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
xlog_cil_force(log);
@@ -3297,7 +3347,7 @@ maybe_sleep:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3362,7 +3412,7 @@ _xfs_log_force_lsn(
ASSERT(lsn != 0);
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
lsn = xlog_cil_force_lsn(log, lsn);
if (lsn == NULLCOMMITLSN)
@@ -3411,7 +3461,7 @@ try_again:
(XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
@@ -3441,7 +3491,7 @@ try_again:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -4023,3 +4073,45 @@ xlog_iclogs_empty(
return 1;
}
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn)
+{
+ struct xlog *log = mp->m_log;
+ bool valid;
+
+ /*
+ * norecovery mode skips mount-time log processing and unconditionally
+ * resets the in-core LSN. We can't validate in this mode, but
+ * modifications are not allowed anyway, so just return true.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return true;
+
+ /*
+ * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+ * handled by recovery and thus safe to ignore here.
+ */
+ if (lsn == NULLCOMMITLSN)
+ return true;
+
+ valid = xlog_valid_lsn(mp->m_log, lsn);
+
+ /* warn the user about what's gone wrong before verifier failure */
+ if (!valid) {
+ spin_lock(&log->l_icloglock);
+ xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+ CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+ log->l_curr_cycle, log->l_curr_block);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return valid;
+}
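
The validity test itself is a lexicographic compare of (cycle, block) pairs packed into the 64-bit LSN. A standalone sketch of that comparison, mirroring the CYCLE_LSN/BLOCK_LSN split:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* LSN layout: high 32 bits cycle, low 32 bits block */
	static uint32_t cycle_lsn(uint64_t lsn) { return lsn >> 32; }
	static uint32_t block_lsn(uint64_t lsn) { return (uint32_t)lsn; }

	static bool lsn_valid(uint64_t lsn, uint32_t cur_cycle, uint32_t cur_block)
	{
		if (cycle_lsn(lsn) > cur_cycle)
			return false;
		if (cycle_lsn(lsn) == cur_cycle && block_lsn(lsn) > cur_block)
			return false;
		return true;
	}

	int main(void)
	{
		uint64_t lsn = ((uint64_t)7 << 32) | 100;   /* cycle 7, block 100 */
		printf("%d\n", lsn_valid(lsn, 7, 200));     /* 1: behind current head */
		printf("%d\n", lsn_valid(lsn, 6, 500));     /* 0: ahead, corruption */
		return 0;
	}
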
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 09d91d3166cd..aa533a7d50f2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,6 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
+bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 950f3f94720c..ed8896310c00 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i)
#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */
#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
@@ -410,6 +411,8 @@ struct xlog {
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
void *l_iclog_bak[XLOG_MAX_ICLOGS];
+ /* log record crc error injection factor */
+ uint32_t l_badcrc_factor;
#endif
};
@@ -560,4 +563,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
remove_wait_queue(wq, &wait);
}
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn)
+{
+ int cur_cycle;
+ int cur_block;
+ bool valid = true;
+
+ /*
+ * First, sample the current lsn without locking to avoid added
+ * contention from metadata I/O. The current cycle and block are updated
+ * (in xlog_state_switch_iclogs()) and read here in a particular order
+ * to avoid false negatives (e.g., thinking the metadata LSN is valid
+ * when it is not).
+ *
+ * The current block is always rewound before the cycle is bumped in
+ * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+ * a transiently forward state. Instead, we can see the LSN in a
+ * transiently behind state if we happen to race with a cycle wrap.
+ */
+ cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ smp_rmb();
+ cur_block = ACCESS_ONCE(log->l_curr_block);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+ /*
+ * If the metadata LSN appears invalid, it's possible the check
+ * above raced with a wrap to the next log cycle. Grab the lock
+ * to check for sure.
+ */
+ spin_lock(&log->l_icloglock);
+ cur_cycle = log->l_curr_cycle;
+ cur_block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+ valid = false;
+ }
+
+ return valid;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
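
xlog_valid_lsn()'s unlocked sample pairs with the rewind-before-bump ordering added to xlog_state_switch_iclogs() earlier in this series: the writer publishes the block rewind before the cycle increment, the reader loads cycle before block, so a racing sample can only lag, never lead. A C11-atomics model of that pairing (fences stand in for smp_wmb()/smp_rmb()):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic int cur_cycle = 1;
	static _Atomic int cur_block = 190;

	static void switch_iclogs_wrap(int logsize)
	{
		int blk = atomic_load_explicit(&cur_block, memory_order_relaxed);
		atomic_store_explicit(&cur_block, blk - logsize,
				      memory_order_relaxed);
		atomic_thread_fence(memory_order_release);   /* smp_wmb() */
		atomic_fetch_add(&cur_cycle, 1);
	}

	static void sample(int *cycle, int *block)
	{
		*cycle = atomic_load_explicit(&cur_cycle, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);   /* smp_rmb() */
		*block = atomic_load_explicit(&cur_block, memory_order_relaxed);
	}

	int main(void)
	{
		int c, b;
		switch_iclogs_wrap(100);
		sample(&c, &b);
		printf("cycle=%d block=%d\n", c, b);  /* never a forward LSN */
		return 0;
	}
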
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 512a0945d52a..da37beb76f6e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -61,6 +61,9 @@ xlog_recover_check_summary(
#else
#define xlog_recover_check_summary(log)
#endif
+STATIC int
+xlog_do_recovery_pass(
+ struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
/*
* This structure is used during recovery to record the buf log items which
@@ -868,6 +871,351 @@ validate_head:
}
/*
+ * Seek backwards in the log for log record headers.
+ *
+ * Given a starting log block, walk backwards until we find the provided number
+ * of records or hit the provided tail block. The return value is the number of
+ * records encountered or a negative error code. The log block and buffer
+ * pointer of the last record seen are returned in rblk and rhead respectively.
+ */
+STATIC int
+xlog_rseek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk backwards from the head block until we hit the tail or the first
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? tail_blk : 0;
+ for (i = (int) head_blk - 1; i >= end_blk; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the tail block or the log record header count,
+ * start looking again from the end of the physical log. Note that
+ * callers can pass head == tail if the tail is not yet known.
+ */
+ if (tail_blk >= head_blk && found != count) {
+ for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is the
+ * number of records encountered or a negative error code. The log block and
+ * buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk forward from the tail block until we hit the head or the last
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+ for (i = (int) tail_blk; i <= end_blk; i++) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the head block or the log record header count,
+ * start looking again from the start of the physical log.
+ */
+ if (tail_blk > head_blk && found != count) {
+ for (i = 0; i < (int) head_blk; i++) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous record.
+ * The tail of the previous record must now be verified to ensure the torn
+ * writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ struct xlog_rec_header *thead;
+ struct xfs_buf *bp;
+ xfs_daddr_t first_bad;
+ int count;
+ int error = 0;
+ bool wrapped;
+ xfs_daddr_t tmp_head;
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return -ENOMEM;
+
+ /*
+ * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+ * a temporary head block that points after the last possible
+ * concurrently written record of the tail.
+ */
+ count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+ XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+ &wrapped);
+ if (count < 0) {
+ error = count;
+ goto out;
+ }
+
+ /*
+ * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+ * into the actual log head. tmp_head points to the start of the record
+ * so update it to the actual head block.
+ */
+ if (count < XLOG_MAX_ICLOGS + 1)
+ tmp_head = head_blk;
+
+ /*
+ * We now have a tail and temporary head block that covers at least
+ * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+ * records were completely written. Run a CRC verification pass from
+ * tail to head and return the result.
+ */
+ error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last good
+ * record in the log and the tail is updated from that record and verified.
+ */
+STATIC int
+xlog_verify_head(
+ struct xlog *log,
+ xfs_daddr_t *head_blk, /* in/out: unverified head */
+ xfs_daddr_t *tail_blk, /* out: tail block */
+ struct xfs_buf *bp,
+ xfs_daddr_t *rhead_blk, /* start blk of last record */
+ struct xlog_rec_header **rhead, /* ptr to last record */
+ bool *wrapped) /* last rec. wraps phys. log */
+{
+ struct xlog_rec_header *tmp_rhead;
+ struct xfs_buf *tmp_bp;
+ xfs_daddr_t first_bad;
+ xfs_daddr_t tmp_rhead_blk;
+ int found;
+ int error;
+ bool tmp_wrapped;
+
+ /*
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
+ */
+ found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
+ rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (!found) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ return -EIO;
+ }
+
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+
+ /*
+ * Now that we have a tail block, check the head of the log for torn
+ * writes. Search again until we hit the tail or the maximum number of
+ * log record I/Os that could have been in flight at one time. Use a
+ * temporary buffer so we don't trash the rhead/bp pointers from the
+ * call above.
+ */
+ tmp_bp = xlog_get_bp(log, 1);
+ if (!tmp_bp)
+ return -ENOMEM;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+ XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+ &tmp_rhead, &tmp_wrapped);
+ xlog_put_bp(tmp_bp);
+ if (error < 0)
+ return error;
+
+ /*
+ * Now run a CRC verification pass over the records starting at the
+ * block found above to the current head. If a CRC failure occurs, the
+ * log block of the first bad record is saved in first_bad.
+ */
+ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ if (error == -EFSBADCRC) {
+ /*
+ * We've hit a potential torn write. Reset the error and warn
+ * about it.
+ */
+ error = 0;
+ xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+ first_bad, *head_blk);
+
+ /*
+ * Get the header block and buffer pointer for the last good
+ * record before the bad record.
+ *
+ * Note that xlog_find_tail() clears the blocks at the new head
+ * (i.e., the records with invalid CRC) if the cycle number
+ * matches the current cycle.
+ */
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
+ rhead_blk, rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (found == 0) /* XXX: right thing to do here? */
+ return -EIO;
+
+ /*
+ * Reset the head block to the starting block of the first bad
+ * log record and set the tail block based on the last good
+ * record.
+ *
+ * Bail out if the updated head/tail match as this indicates
+ * possible corruption outside of the acceptable
+ * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+ */
+ *head_blk = first_bad;
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+ if (*head_blk == *tail_blk) {
+ ASSERT(0);
+ return 0;
+ }
+
+ /*
+ * Now verify the tail based on the updated head. This is
+ * required because the torn writes trimmed from the head could
+ * have been written over the tail of a previous record. Return
+ * any errors since recovery cannot proceed if the tail is
+ * corrupt.
+ *
+ * XXX: This leaves a gap in truly robust protection from torn
+ * writes in the log. If the head is behind the tail and the
+ * tail pushes forward to create some space, a crash that tears
+ * the writes into the previous record's tail region leaves log
+ * recovery unable to recover.
+ *
+ * How likely is this to occur? If possible, can we do something
+ * more intelligent here? Is it safe to push the tail forward if
+ * we can determine that the tail is within the range of the
+ * torn write (e.g., the kernel can only overwrite the tail if
+ * it has actually been pushed forward)? Alternatively, could we
+ * somehow prevent this condition at runtime?
+ */
+ error = xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
+
+ return error;
+}
+
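Stripped of the log I/O, the policy above amounts to: CRC-verify a fixed-size window of records behind the head and truncate the head at the first mismatch. Below is a toy model of that trimming step, assuming an array of fixed-layout records with a stored checksum; toy_crc is a plain bitwise CRC-32, not the kernel's crc32c:

#include <stdint.h>
#include <stddef.h>

struct rec {
	uint32_t crc;			/* checksum of payload[0..len) */
	uint32_t len;
	uint8_t	 payload[64];
};

static uint32_t toy_crc(const uint8_t *p, size_t n)
{
	uint32_t c = ~0u;

	while (n--) {
		c ^= *p++;
		for (int k = 0; k < 8; k++)
			c = (c >> 1) ^ (0xEDB88320u & -(c & 1u));
	}
	return ~c;
}

/*
 * Verify the last "window" records before head (the analogue of
 * XLOG_MAX_ICLOGS records behind the log head). The first CRC mismatch
 * becomes the new head; a clean window leaves the head untouched.
 */
static int trim_torn_head(const struct rec *recs, int head, int window)
{
	int start = head > window ? head - window : 0;

	for (int i = start; i < head; i++) {
		const struct rec *r = &recs[i];

		if (r->len > sizeof(r->payload) ||
		    toy_crc(r->payload, r->len) != r->crc)
			return i;	/* first bad record: truncate here */
	}
	return head;			/* window is clean */
}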
+/*
* Find the sync block number or the tail of the log.
*
* This will be the block number of the last record to have its
@@ -893,13 +1241,13 @@ xlog_find_tail(
xlog_op_header_t *op_head;
char *offset = NULL;
xfs_buf_t *bp;
- int error, i, found;
+ int error;
xfs_daddr_t umount_data_blk;
xfs_daddr_t after_umount_blk;
+ xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
int hblks;
-
- found = 0;
+ bool wrapped = false;
/*
* Find previous log record
@@ -923,48 +1271,16 @@ xlog_find_tail(
}
/*
- * Search backwards looking for log record header block
+ * Trim the head block back to skip over torn records. We can have
+ * multiple log I/Os in flight at any time, so we assume CRC failures
+ * back through the previous several records are torn writes and skip
+ * them.
*/
ASSERT(*head_blk < INT_MAX);
- for (i = (int)(*head_blk) - 1; i >= 0; i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
- if (error)
- goto done;
-
- if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
- found = 1;
- break;
- }
- }
- /*
- * If we haven't found the log record header block, start looking
- * again from the end of the physical log. XXXmiken: There should be
- * a check here to make sure we didn't search more than N blocks in
- * the previous code.
- */
- if (!found) {
- for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
- if (error)
- goto done;
-
- if (*(__be32 *)offset ==
- cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
- found = 2;
- break;
- }
- }
- }
- if (!found) {
- xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- xlog_put_bp(bp);
- ASSERT(0);
- return -EIO;
- }
-
- /* find blk_no of tail of log */
- rhead = (xlog_rec_header_t *)offset;
- *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+ error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
+ &rhead, &wrapped);
+ if (error)
+ goto done;
/*
* Reset log values according to the state of the log when we
@@ -976,10 +1292,10 @@ xlog_find_tail(
* written was complete and ended exactly on the end boundary
* of the physical log.
*/
- log->l_prev_block = i;
+ log->l_prev_block = rhead_blk;
log->l_curr_block = (int)*head_blk;
log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
- if (found == 2)
+ if (wrapped)
log->l_curr_cycle++;
atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
@@ -1014,12 +1330,13 @@ xlog_find_tail(
} else {
hblks = 1;
}
- after_umount_blk = (i + hblks + (int)
- BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+ after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+ after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
tail_lsn = atomic64_read(&log->l_tail_lsn);
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
- umount_data_blk = (i + hblks) % log->l_logBBsize;
+ umount_data_blk = rhead_blk + hblks;
+ umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
if (error)
goto done;
@@ -3204,6 +3521,7 @@ xlog_recover_dquot_ra_pass2(
struct xfs_disk_dquot *recddq;
struct xfs_dq_logformat *dq_f;
uint type;
+ int len;
if (mp->m_qflags == 0)
@@ -3224,8 +3542,12 @@ xlog_recover_dquot_ra_pass2(
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+ len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
+ if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
+ &xfs_dquot_buf_ra_ops);
}
STATIC void
@@ -3431,7 +3753,7 @@ xlog_recover_add_to_cont_trans(
* previous record. Copy the rest of the header.
*/
if (list_empty(&trans->r_itemq)) {
- ASSERT(len < sizeof(struct xfs_trans_header));
+ ASSERT(len <= sizeof(struct xfs_trans_header));
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
return -EIO;
@@ -4118,26 +4440,69 @@ xlog_recover_process_iunlinks(
mp->m_dmevmask = mp_dmevmask;
}
+STATIC int
+xlog_unpack_data(
+ struct xlog_rec_header *rhead,
+ char *dp,
+ struct xlog *log)
+{
+ int i, j, k;
+
+ for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+ i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+ *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+ for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+ dp += BBSIZE;
+ }
+ }
+
+ return 0;
+}
+
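The unpack loop above undoes the log's cycle stamping: at write time the first four bytes of every 512-byte basic block in a record are overwritten with the cycle number, so a torn block is detectable by its stale cycle, and the displaced words are stashed in h_cycle_data (spilling into extended headers for larger records). A simplified sketch of both directions, assuming a single header whose saved-word array covers the whole record:

#include <stdint.h>
#include <string.h>

#define BBSIZE		512
#define MAX_BBLOCKS	64	/* one header's worth of saved words */

struct toyhdr {
	uint32_t cycle;				/* current cycle number */
	uint32_t nblocks;			/* payload size in basic blocks */
	uint32_t cycle_data[MAX_BBLOCKS];	/* displaced first words */
};

/* Write side: stash each block's first word, stamp the cycle over it. */
static void cycle_pack(struct toyhdr *h, uint8_t *dp)
{
	for (uint32_t i = 0; i < h->nblocks; i++) {
		memcpy(&h->cycle_data[i], dp + i * BBSIZE, sizeof(uint32_t));
		memcpy(dp + i * BBSIZE, &h->cycle, sizeof(uint32_t));
	}
}

/* Recovery side: restore the stashed words before replaying the record. */
static void cycle_unpack(const struct toyhdr *h, uint8_t *dp)
{
	for (uint32_t i = 0; i < h->nblocks; i++)
		memcpy(dp + i * BBSIZE, &h->cycle_data[i], sizeof(uint32_t));
}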
/*
- * Upack the log buffer data and crc check it. If the check fails, issue a
- * warning if and only if the CRC in the header is non-zero. This makes the
- * check an advisory warning, and the zero CRC check will prevent failure
- * warnings from being emitted when upgrading the kernel from one that does not
- * add CRCs by default.
- *
- * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
- * corruption failure
+ * CRC check, unpack and process a log record.
*/
STATIC int
-xlog_unpack_data_crc(
+xlog_recover_process(
+ struct xlog *log,
+ struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
- struct xlog *log)
+ int pass)
{
+ int error;
__le32 crc;
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
- if (crc != rhead->h_crc) {
+
+ /*
+ * Nothing else to do if this is a CRC verification pass. Just return
+ * if this is a record with a non-zero crc. Unfortunately, mkfs always
+ * sets h_crc to 0 so we must consider this valid even on v5 supers.
+ * Otherwise, return EFSBADCRC on failure so the callers up the stack
+ * know precisely what failed.
+ */
+ if (pass == XLOG_RECOVER_CRCPASS) {
+ if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc))
+ return -EFSBADCRC;
+ return 0;
+ }
+
+ /*
+ * We're in the normal recovery path. Issue a warning if and only if the
+ * CRC in the header is non-zero. This is an advisory warning and the
+ * zero CRC check prevents warnings from being emitted when upgrading
+ * the kernel from one that does not add CRCs by default.
+ */
+ if (crc != le32_to_cpu(rhead->h_crc)) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
@@ -4147,47 +4512,18 @@ xlog_unpack_data_crc(
}
/*
- * If we've detected a log record corruption, then we can't
- * recover past this point. Abort recovery if we are enforcing
- * CRC protection by punting an error back up the stack.
+ * If the filesystem is CRC enabled, this mismatch becomes a
+ * fatal log corruption failure.
*/
if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
return -EFSCORRUPTED;
}
- return 0;
-}
-
-STATIC int
-xlog_unpack_data(
- struct xlog_rec_header *rhead,
- char *dp,
- struct xlog *log)
-{
- int i, j, k;
- int error;
-
- error = xlog_unpack_data_crc(rhead, dp, log);
+ error = xlog_unpack_data(rhead, dp, log);
if (error)
return error;
- for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
- for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
- dp += BBSIZE;
- }
- }
-
- return 0;
+ return xlog_recover_process_data(log, rhash, rhead, dp, pass);
}
STATIC int
@@ -4239,18 +4575,21 @@ xlog_do_recovery_pass(
struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
- int pass)
+ int pass,
+ xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
+ xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
- int error = 0, h_size;
+ int error = 0, h_size, h_len;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
struct hlist_head rhash[XLOG_RHASH_SIZE];
ASSERT(head_blk != tail_blk);
+ rhead_blk = 0;
/*
* Read the header of the tail block and get the iclog buffer size from
@@ -4274,7 +4613,31 @@ xlog_do_recovery_pass(
error = xlog_valid_rec_header(log, rhead, tail_blk);
if (error)
goto bread_err1;
+
+ /*
+ * xfsprogs has a bug where record length is based on lsunit but
+ * h_size (iclog size) is hardcoded to 32k. Now that we
+ * unconditionally CRC verify the unmount record, this means the
+ * log buffer can be too small for the record and cause an
+ * overrun.
+ *
+ * Detect this condition here. Use lsunit for the buffer size as
+ * long as this looks like the mkfs case. Otherwise, return an
+ * error to avoid a buffer overrun.
+ */
h_size = be32_to_cpu(rhead->h_size);
+ h_len = be32_to_cpu(rhead->h_len);
+ if (h_len > h_size) {
+ if (h_len <= log->l_mp->m_logbsize &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ xfs_warn(log->l_mp,
+ "invalid iclog size (%d bytes), using lsunit (%d bytes)",
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
+ } else
+ return -EFSCORRUPTED;
+ }
+
if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
(h_size > XLOG_HEADER_CYCLE_SIZE)) {
hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
@@ -4301,7 +4664,7 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
- blk_no = tail_blk;
+ blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -4408,19 +4771,18 @@ xlog_do_recovery_pass(
goto bread_err2;
}
- error = xlog_unpack_data(rhead, offset, log);
+ error = xlog_recover_process(log, rhash, rhead, offset,
+ pass);
if (error)
goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
blk_no += bblks;
+ rhead_blk = blk_no;
}
ASSERT(blk_no >= log->l_logBBsize);
blk_no -= log->l_logBBsize;
+ rhead_blk = blk_no;
}
/* read first part of physical log */
@@ -4441,21 +4803,22 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = xlog_unpack_data(rhead, offset, log);
+ error = xlog_recover_process(log, rhash, rhead, offset, pass);
if (error)
goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
blk_no += bblks + hblks;
+ rhead_blk = blk_no;
}
bread_err2:
xlog_put_bp(dbp);
bread_err1:
xlog_put_bp(hbp);
+
+ if (error && first_bad)
+ *first_bad = rhead_blk;
+
return error;
}
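The h_size fixup in the hunk above follows a narrow allow-list pattern: an oversized h_len is accepted only when the record looks exactly like the known xfsprogs unmount record (a single logop that fits within lsunit), and anything else is treated as corruption. A sketch of the same check, with plain ints standing in for the on-disk big-endian fields:

#include <stdio.h>

/*
 * Returns 0 and possibly widens *h_size in the known-benign mkfs case,
 * or -1 when the oversized record length must be treated as corruption.
 */
static int fixup_iclog_size(int *h_size, int h_len, int num_logops,
			    int logbsize)
{
	if (h_len <= *h_size)
		return 0;			/* normal, consistent record */

	if (h_len <= logbsize && num_logops == 1) {
		fprintf(stderr,
			"invalid iclog size (%d bytes), using lsunit (%d bytes)\n",
			*h_size, logbsize);
		*h_size = logbsize;		/* size the buffer for lsunit */
		return 0;
	}
	return -1;				/* would overrun: corrupt */
}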
@@ -4493,7 +4856,7 @@ xlog_do_log_recovery(
INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
- XLOG_RECOVER_PASS1);
+ XLOG_RECOVER_PASS1, NULL);
if (error != 0) {
kmem_free(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
@@ -4504,7 +4867,7 @@ xlog_do_log_recovery(
* When it is complete free the table of buf cancel items.
*/
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
- XLOG_RECOVER_PASS2);
+ XLOG_RECOVER_PASS2, NULL);
#ifdef DEBUG
if (!error) {
int i;
@@ -4609,9 +4972,19 @@ xlog_recover(
int error;
/* find the tail of the log */
- if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+ error = xlog_find_tail(log, &head_blk, &tail_blk);
+ if (error)
return error;
+ /*
+ * The superblock was read before the log was available and thus the LSN
+ * could not be verified. Check the superblock LSN against the current
+ * LSN now that it's known.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+ return -EINVAL;
+
if (tail_blk != head_blk) {
/* There used to be a comment here:
*
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index d8b67547ab34..11792d888e4e 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,6 +17,7 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_error.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
+ int level; \
\
va_start(args, fmt); \
\
@@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
\
__xfs_printk(kern_level, mp, &vaf); \
va_end(args); \
+ \
+ if (!kstrtoint(kern_level, 0, &level) && \
+ level <= LOGLEVEL_ERR && \
+ xfs_error_level >= XFS_ERRLEVEL_HIGH) \
+ xfs_stack_trace(); \
} \
define_xfs_printk_level(xfs_emerg, KERN_EMERG);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bf92e0c037c7..bb753b359bee 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
+void
+xfs_uuid_table_free(void)
+{
+ if (xfs_uuid_table_size == 0)
+ return;
+ kmem_free(xfs_uuid_table);
+ xfs_uuid_table = NULL;
+ xfs_uuid_table_size = 0;
+}
+
/*
* See if the UUID is unique among mounted XFS filesystems.
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -693,10 +703,15 @@ xfs_mountfs(
if (error)
goto out;
- error = xfs_uuid_mount(mp);
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_del_stats;
+
/*
* Set the minimum read and write sizes
*/
@@ -971,6 +986,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_del_stats:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
@@ -1047,6 +1064,7 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
+
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1056,6 +1074,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
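The stats kobject is threaded through xfs_mountfs() with the usual goto-unwind discipline: each init step gets a matching label, a failure jumps to the label that undoes everything initialised so far, and the unmount path repeats the same calls in reverse order. A minimal sketch of the pattern, using hypothetical stub steps rather than the real init calls:

static int init_sysfs(void)  { return 0; }	/* stub steps; any may fail */
static int init_stats(void)  { return 0; }
static int init_uuid(void)   { return 0; }
static void del_sysfs(void)  { }
static void del_stats(void)  { }
static void del_uuid(void)   { }

static int setup(void)
{
	int error;

	error = init_sysfs();
	if (error)
		goto out;
	error = init_stats();
	if (error)
		goto out_del_sysfs;
	error = init_uuid();
	if (error)
		goto out_del_stats;
	return 0;

out_del_stats:
	del_stats();
out_del_sysfs:
	del_sysfs();
out:
	return error;
}

/* Teardown mirrors setup in reverse, as the unmount path above does. */
static void teardown(void)
{
	del_uuid();
	del_stats();
	del_sysfs();
}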
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91cd49a..b57098481c10 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -127,6 +127,7 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_data_workqueue;
@@ -312,6 +313,7 @@ typedef struct xfs_perag {
int pagb_count; /* pagb slots in use */
} xfs_perag_t;
+extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
@@ -336,4 +338,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ab4a6066f7ca..dc6221942b85 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -181,6 +181,11 @@ xfs_fs_map_blocks(
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * xfs_iomap_write_direct() expects to take ownership of
+ * the shared ilock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
error = xfs_iomap_write_direct(ip, offset, length,
&imap, nimaps);
if (error)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eac9549efd52..532ab79d38fe 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -184,7 +184,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(mp, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
@@ -448,11 +448,11 @@ xfs_qm_dquot_isolate(
*/
if (dqp->q_nrefs) {
xfs_dqunlock(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
list_lru_isolate(lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -496,19 +496,19 @@ xfs_qm_dquot_isolate(
ASSERT(dqp->q_nrefs == 0);
list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
return LRU_SKIP;
out_unlock_dirty:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
xfs_dqunlock(dqp);
spin_lock(lru_lock);
return LRU_RETRY;
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
unsigned long freed;
int error;
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
return 0;
INIT_LIST_HEAD(&isol.buffers);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ab1bac6a3a1c..be02a68b2fe2 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -766,7 +766,6 @@ xfs_growfs_rt_alloc(
{
xfs_fileoff_t bno; /* block number in file */
struct xfs_buf *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
xfs_daddr_t d; /* disk block address */
int error; /* error return value */
xfs_fsblock_t firstblock;/* first block allocated in xaction */
@@ -811,7 +810,7 @@ xfs_growfs_rt_alloc(
/*
* Free any blocks freed up in the transaction, then commit.
*/
- error = xfs_bmap_finish(&tp, &flist, &committed);
+ error = xfs_bmap_finish(&tp, &flist, NULL);
if (error)
goto out_bmap_cancel;
error = xfs_trans_commit(tp);
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f2240383d4bb..8686df6c7609 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -18,20 +18,21 @@
#include "xfs.h"
#include <linux/proc_fs.h>
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
+struct xstats xfsstats;
-static int counter_val(int idx)
+static int counter_val(struct xfsstats __percpu *stats, int idx)
{
int val = 0, cpu;
for_each_possible_cpu(cpu)
- val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
return val;
}
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{
int i, j;
+ int len = 0;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -65,54 +66,59 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
};
/* Loop over all stats groups */
+
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- seq_printf(m, "%s", xstats[i].desc);
+ len += snprintf(buf + len, PATH_MAX - len, "%s",
+ xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- seq_printf(m, " %u", counter_val(j));
- seq_putc(m, '\n');
+ len += snprintf(buf + len, PATH_MAX - len, " %u",
+ counter_val(stats, j));
+ len += snprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
- xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
- xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
}
- seq_printf(m, "xpc %Lu %Lu %Lu\n",
+ len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- seq_printf(m, "debug %u\n",
+ len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
- return 0;
+
+ return len;
}
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
{
- return single_open(file, xfs_stat_proc_show, NULL);
+ int c;
+ __uint32_t vn_active;
+
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu_ptr(stats, c)->vn_active;
+ memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+ per_cpu_ptr(stats, c)->vn_active = vn_active;
+ preempt_enable();
+ }
}
-static const struct file_operations xfs_stat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xfs_stat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- 0,
- counter_val(XFSSTAT_END_XQMSTAT),
- 0,
- counter_val(XFSSTAT_END_XQMSTAT + 1));
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
return 0;
}
@@ -136,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
seq_printf(m, "qm");
for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
- seq_printf(m, " %u", counter_val(j));
+ seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
}
@@ -155,44 +161,35 @@ static const struct file_operations xqmstat_proc_fops = {
};
#endif /* CONFIG_XFS_QUOTA */
+#ifdef CONFIG_PROC_FS
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
+ return -ENOMEM;
+
+ if (!proc_symlink("fs/xfs/stat", NULL,
+ "/sys/fs/xfs/stats/stats"))
goto out;
- if (!proc_create("fs/xfs/stat", 0, NULL,
- &xfs_stat_proc_fops))
- goto out_remove_xfs_dir;
#ifdef CONFIG_XFS_QUOTA
if (!proc_create("fs/xfs/xqmstat", 0, NULL,
&xqmstat_proc_fops))
- goto out_remove_stat_file;
+ goto out;
if (!proc_create("fs/xfs/xqm", 0, NULL,
&xqm_proc_fops))
- goto out_remove_xqmstat_file;
+ goto out;
#endif
return 0;
-#ifdef CONFIG_XFS_QUOTA
- out_remove_xqmstat_file:
- remove_proc_entry("fs/xfs/xqmstat", NULL);
- out_remove_stat_file:
- remove_proc_entry("fs/xfs/stat", NULL);
-#endif
- out_remove_xfs_dir:
- remove_proc_entry("fs/xfs", NULL);
- out:
+out:
+ remove_proc_subtree("fs/xfs", NULL);
return -ENOMEM;
}
void
xfs_cleanup_procfs(void)
{
-#ifdef CONFIG_XFS_QUOTA
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-#endif
- remove_proc_entry("fs/xfs/stat", NULL);
- remove_proc_entry("fs/xfs", NULL);
+ remove_proc_subtree("fs/xfs", NULL);
}
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c8f238b8299a..483b0eff1988 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -19,8 +19,6 @@
#define __XFS_STATS_H__
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
#include <linux/percpu.h>
/*
@@ -215,15 +213,29 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
+#define XFS_STATS_INC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+} while (0)
+
+#define XFS_STATS_DEC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+} while (0)
+
+#define XFS_STATS_ADD(mp, v, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+} while (0)
+
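With this change every statistics update lands twice, once in the global xfsstats and once in the mount's own per-cpu set, while readers fold all CPUs' copies into a total on demand. Here is a userspace analogue of the idea, using atomics sharded by a hashed thread id in place of true per-cpu data; all names are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

#define NR_SHARDS 16

struct statset {
	_Atomic uint64_t vn_active[NR_SHARDS];
};

struct mount {
	struct statset stats;		/* per-mount counters */
};

static struct statset global_stats;	/* the fs-wide aggregate */

static unsigned int shard(void)
{
	/* cheap stand-in for smp_processor_id(); glibc pthread_t is integral */
	return (unsigned int)((uintptr_t)pthread_self() >> 4) % NR_SHARDS;
}

#define STATS_INC(mp, field)						\
do {									\
	unsigned int __s = shard();					\
	atomic_fetch_add(&global_stats.field[__s], 1);			\
	atomic_fetch_add(&(mp)->stats.field[__s], 1);			\
} while (0)

/* Readers pay the cost: fold every shard into one total on demand. */
static uint64_t vn_active_sum(struct statset *s)
{
	uint64_t total = 0;

	for (int i = 0; i < NR_SHARDS; i++)
		total += atomic_load(&s->vn_active[i]);
	return total;
}

A call like STATS_INC(mp, vn_active) then costs two uncontended atomic adds on the hot path, and only the comparatively rare read pass touches every shard.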
+#if defined(CONFIG_PROC_FS)
extern int xfs_init_procfs(void);
extern void xfs_cleanup_procfs(void);
@@ -231,10 +243,6 @@ extern void xfs_cleanup_procfs(void);
#else /* !CONFIG_PROC_FS */
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
static inline int xfs_init_procfs(void)
{
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 904f637cfa5f..59c9b7bd958d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -137,7 +137,7 @@ static const match_table_t tokens = {
};
-STATIC unsigned long
+STATIC int
suffix_kstrtoint(char *s, unsigned int base, int *res)
{
int last, shift_left_factor = 0, _res;
@@ -838,17 +838,18 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
+ mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -922,7 +923,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(vn_reclaim);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -983,8 +984,8 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
xfs_inactive(ip);
}
@@ -1474,9 +1475,16 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_workqueues;
+ /* Allocate stats memory before we do operations that might use it */
+ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!mp->m_stats.xs_stats) {
+ error = -ENOMEM;
+ goto out_destroy_counters;
+ }
+
error = xfs_readsb(mp, flags);
if (error)
- goto out_destroy_counters;
+ goto out_free_stats;
error = xfs_finish_flags(mp);
if (error)
@@ -1545,9 +1553,11 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_stats:
+ free_percpu(mp->m_stats.xs_stats);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
-out_destroy_workqueues:
+ out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
@@ -1574,6 +1584,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1703,8 +1714,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+ KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
@@ -1838,19 +1849,32 @@ init_xfs_fs(void)
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;;
+ goto out_sysctl_unregister;
+ }
+
+ xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+ xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!xfsstats.xs_stats) {
+ error = -ENOMEM;
+ goto out_kset_unregister;
}
+ error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
+ "stats");
+ if (error)
+ goto out_free_stats;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_kset_unregister;
+ goto out_remove_stats_kobj;
#endif
error = xfs_qm_init();
if (error)
- goto out_remove_kobj;
+ goto out_remove_dbg_kobj;
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1859,11 +1883,15 @@ init_xfs_fs(void)
out_qm_exit:
xfs_qm_exit();
- out_remove_kobj:
+ out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+ out_remove_stats_kobj:
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+ free_percpu(xfsstats.xs_stats);
+ out_kset_unregister:
kset_unregister(xfs_kset);
out_sysctl_unregister:
xfs_sysctl_unregister();
@@ -1889,6 +1917,8 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
@@ -1896,6 +1926,7 @@ exit_xfs_fs(void)
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
+ xfs_uuid_table_free();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 996481eeb491..b44284c1adda 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- int committed;
xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
int nmaps;
@@ -387,7 +386,7 @@ xfs_symlink(
xfs_trans_set_sync(tp);
}
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt(
struct xfs_inode *ip)
{
xfs_buf_t *bp;
- int committed;
int done;
int error;
xfs_fsblock_t first_block;
@@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt(
/*
* Commit the first transaction. This logs the EFI and the inode.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, ip);
if (error)
goto error_bmap_cancel;
/*
- * The transaction must have been committed, since there were
- * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
- * The new tp has the extent freeing and EFDs.
- */
- ASSERT(committed);
- /*
* The first xact was committed, so add the inode to the new one.
* Mark it dirty so it will be logged and moved forward in the log as
* part of every commit.
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index a0c8067cea6f..aed74d3f8da9 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "xfs_error.h"
+#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
@@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler(
size_t *lenp,
loff_t *ppos)
{
- int c, ret, *valp = ctl->data;
- __uint32_t vn_active;
+ int ret, *valp = ctl->data;
ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
- xfs_notice(NULL, "Clearing xfsstats");
- for_each_possible_cpu(c) {
- preempt_disable();
- /* save vn_active, it's a universal truth! */
- vn_active = per_cpu(xfsstats, c).vn_active;
- memset(&per_cpu(xfsstats, c), 0,
- sizeof(struct xfsstats));
- per_cpu(xfsstats, c).vn_active = vn_active;
- preempt_enable();
- }
+ xfs_stats_clearall(xfsstats.xs_stats);
xfs_stats_clear = 0;
}
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index aa03670851d8..641d625eb334 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -21,11 +21,13 @@
#include "xfs_log_format.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_stats.h"
struct xfs_sysfs_attr {
struct attribute attr;
- ssize_t (*show)(char *buf, void *data);
- ssize_t (*store)(const char *buf, size_t count, void *data);
+ ssize_t (*show)(struct kobject *kobject, char *buf);
+ ssize_t (*store)(struct kobject *kobject, const char *buf,
+ size_t count);
};
static inline struct xfs_sysfs_attr *
@@ -38,6 +40,8 @@ to_attr(struct attribute *attr)
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
#define XFS_SYSFS_ATTR_RO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
@@ -51,14 +55,42 @@ struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
};
+STATIC ssize_t
+xfs_sysfs_object_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+ .show = xfs_sysfs_object_show,
+ .store = xfs_sysfs_object_store,
+};
+
#ifdef DEBUG
/* debug */
STATIC ssize_t
log_recovery_delay_store(
+ struct kobject *kobject,
const char *buf,
- size_t count,
- void *data)
+ size_t count)
{
int ret;
int val;
@@ -77,8 +109,8 @@ log_recovery_delay_store(
STATIC ssize_t
log_recovery_delay_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
}
@@ -89,52 +121,87 @@ static struct attribute *xfs_dbg_attrs[] = {
NULL,
};
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* stats */
+
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xstats, xs_kobj);
+}
+
STATIC ssize_t
-xfs_dbg_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+stats_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xstats *stats = to_xstats(kobject);
- return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+ return xfs_stats_format(stats->xs_stats, buf);
}
+XFS_SYSFS_ATTR_RO(stats);
STATIC ssize_t
-xfs_dbg_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+stats_clear_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ int ret;
+ int val;
+ struct xstats *stats = to_xstats(kobject);
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
- return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+ if (val != 1)
+ return -EINVAL;
+
+ xfs_stats_clearall(stats->xs_stats);
+ return count;
}
+XFS_SYSFS_ATTR_WO(stats_clear);
-static struct sysfs_ops xfs_dbg_ops = {
- .show = xfs_dbg_show,
- .store = xfs_dbg_store,
+static struct attribute *xfs_stats_attrs[] = {
+ ATTR_LIST(stats),
+ ATTR_LIST(stats_clear),
+ NULL,
};
-struct kobj_type xfs_dbg_ktype = {
+struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_dbg_ops,
- .default_attrs = xfs_dbg_attrs,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_stats_attrs,
};
-#endif /* DEBUG */
-
/* xlog */
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
STATIC ssize_t
log_head_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
spin_lock(&log->l_icloglock);
cycle = log->l_curr_cycle;
@@ -147,12 +214,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn);
STATIC ssize_t
log_tail_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
@@ -161,12 +228,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
reserve_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -175,65 +243,64 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head);
STATIC ssize_t
write_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
-static struct attribute *xfs_log_attrs[] = {
- ATTR_LIST(log_head_lsn),
- ATTR_LIST(log_tail_lsn),
- ATTR_LIST(reserve_grant_head),
- ATTR_LIST(write_grant_head),
- NULL,
-};
-
-static inline struct xlog *
-to_xlog(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
- return container_of(kobj, struct xlog, l_kobj);
-}
-
+#ifdef DEBUG
STATIC ssize_t
-xfs_log_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+log_badcrc_factor_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xlog *log = to_xlog(kobject);
+ int ret;
+ uint32_t val;
- return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ log->l_badcrc_factor = val;
+
+ return count;
}
STATIC ssize_t
-xfs_log_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+log_badcrc_factor_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xlog *log = to_xlog(kobject);
- return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor);
}
-static struct sysfs_ops xfs_log_ops = {
- .show = xfs_log_show,
- .store = xfs_log_store,
+XFS_SYSFS_ATTR_RW(log_badcrc_factor);
+#endif /* DEBUG */
+
+static struct attribute *xfs_log_attrs[] = {
+ ATTR_LIST(log_head_lsn),
+ ATTR_LIST(log_tail_lsn),
+ ATTR_LIST(reserve_grant_head),
+ ATTR_LIST(write_grant_head),
+#ifdef DEBUG
+ ATTR_LIST(log_badcrc_factor),
+#endif
+ NULL,
};
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_log_ops,
+ .sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
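The sysfs rework above hinges on one idea: instead of a sysfs_ops per kobj_type that smuggles a data pointer into each attribute, a single generic show/store pair recovers the owning object from the embedded kobject with container_of, and each attribute takes the kobject directly. A freestanding sketch of that recovery step, with container_of spelled out and illustrative type names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobj {
	const char *name;
};

/* Analogue of struct xstats with its embedded xs_kobj. */
struct stats_obj {
	int		counter;
	struct kobj	kobj;
};

static struct stats_obj *to_stats(struct kobj *k)
{
	return container_of(k, struct stats_obj, kobj);
}

int main(void)
{
	struct stats_obj s = { .counter = 42, .kobj = { .name = "stats" } };
	struct kobj *k = &s.kobj;	/* all sysfs hands an attribute is this */

	printf("%s: %d\n", k->name, to_stats(k)->counter);
	return 0;
}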
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 240eee35f342..be692e59938d 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -22,6 +22,7 @@
extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern struct kobj_type xfs_dbg_ktype; /* debug */
extern struct kobj_type xfs_log_ktype; /* xlog */
+extern struct kobj_type xfs_stats_ktype; /* stats */
static inline struct xfs_kobj *
to_kobj(struct kobject *kobject)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5ed36b1e04c1..391d797cb53f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
@@ -1221,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
+DECLARE_EVENT_CLASS(xfs_readpage_class,
+ TP_PROTO(struct inode *inode, int nr_pages),
+ TP_ARGS(inode, nr_pages),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, nr_pages)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->nr_pages = nr_pages;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name) \
+DEFINE_EVENT(xfs_readpage_class, name, \
+ TP_PROTO(struct inode *inode, int nr_pages), \
+ TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(xfs_vm_readpage);
+DEFINE_READPAGE_EVENT(xfs_vm_readpages);
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int type, struct xfs_bmbt_irec *irec),
@@ -1312,6 +1339,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a0ab1dae9c31..748b16aff45a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -930,9 +930,9 @@ __xfs_trans_commit(
*/
if (sync) {
error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
- XFS_STATS_INC(xs_trans_sync);
+ XFS_STATS_INC(mp, xs_trans_sync);
} else {
- XFS_STATS_INC(xs_trans_async);
+ XFS_STATS_INC(mp, xs_trans_async);
}
return error;
@@ -955,7 +955,7 @@ out_unreserve:
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
- XFS_STATS_INC(xs_trans_empty);
+ XFS_STATS_INC(mp, xs_trans_empty);
return error;
}
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098cf490189..aa67339b9537 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -349,7 +349,7 @@ xfsaild_push(
xfs_ail_min_lsn(ailp))) {
ailp->xa_log_flush = 0;
- XFS_STATS_INC(xs_push_ail_flush);
+ XFS_STATS_INC(mp, xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
}
@@ -371,7 +371,7 @@ xfsaild_push(
goto out_done;
}
- XFS_STATS_INC(xs_push_ail);
+ XFS_STATS_INC(mp, xs_push_ail);
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
@@ -385,7 +385,7 @@ xfsaild_push(
lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
- XFS_STATS_INC(xs_push_ail_success);
+ XFS_STATS_INC(mp, xs_push_ail_success);
trace_xfs_ail_push(lip);
ailp->xa_last_pushed_lsn = lsn;
@@ -403,7 +403,7 @@ xfsaild_push(
* re-try the flushing relatively soon if most of the
* AIL is being flushed.
*/
- XFS_STATS_INC(xs_push_ail_flushing);
+ XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
flushing++;
@@ -411,14 +411,14 @@ xfsaild_push(
break;
case XFS_ITEM_PINNED:
- XFS_STATS_INC(xs_push_ail_pinned);
+ XFS_STATS_INC(mp, xs_push_ail_pinned);
trace_xfs_ail_pinned(lip);
stuck++;
ailp->xa_log_flush++;
break;
case XFS_ITEM_LOCKED:
- XFS_STATS_INC(xs_push_ail_locked);
+ XFS_STATS_INC(mp, xs_push_ail_locked);
trace_xfs_ail_locked(lip);
stuck++;
@@ -497,6 +497,7 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
+ set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index ce78534a047e..995170194df0 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -572,12 +572,16 @@ xfs_quota_warn(
struct xfs_dquot *dqp,
int type)
{
- /* no warnings for project quotas - we just return ENOSPC later */
+ enum quota_type qtype;
+
if (dqp->dq_flags & XFS_DQ_PROJ)
- return;
- quota_send_warning(make_kqid(&init_user_ns,
- (dqp->dq_flags & XFS_DQ_USER) ?
- USRQUOTA : GRPQUOTA,
+ qtype = PRJQUOTA;
+ else if (dqp->dq_flags & XFS_DQ_USER)
+ qtype = USRQUOTA;
+ else
+ qtype = GRPQUOTA;
+
+ quota_send_warning(make_kqid(&init_user_ns, qtype,
be32_to_cpu(dqp->q_core.d_id)),
mp->m_super->s_dev, type);
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 17280cd71934..b97f1df910ab 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -108,6 +108,15 @@ xfs_trans_log_inode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
+ * Record the specific change for fdatasync optimisation. This
+ * allows fdatasync to skip log forces for inodes that are only
+ * timestamp dirty. We do this before the change count so that
+ * the core being logged in this case does not impact fdatasync
+ * behaviour.
+ */
+ ip->i_itemp->ili_fsync_fields |= flags;
+
+ /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. We don't use
* inode_inc_version() because there is no need for extra locking around
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c036815183cb..110f1d7d86b0 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,15 +32,13 @@
static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int xflags)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *value, size_t size)
{
+ int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
/* Convert Linux syscall to XFS internal ATTR flags */
if (!size) {
xflags |= ATTR_KERNOVAL;
@@ -53,14 +51,35 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
return asize;
}
-static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int xflags)
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name,
+ int xflags)
{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ /*
+ * Invalidate any cached ACLs if the user has bypassed the ACL
+ * interface. We don't validate the content whatsoever, so it is the
+ * caller's responsibility to provide data in a valid format and to
+ * ensure i_mode is consistent.
+ */
+ if (xflags & ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+#endif
+ }
+}
- if (strcmp(name, "") == 0)
- return -EINVAL;
+static int
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *value, size_t size, int flags)
+{
+ int xflags = handler->flags;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
/* Convert Linux syscall to XFS internal ATTR flags */
if (flags & XATTR_CREATE)
@@ -70,8 +89,12 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
if (!value)
return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- return xfs_attr_set(ip, (unsigned char *)name,
+ error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
+ if (!error)
+ xfs_forget_acl(d_inode(dentry), name, xflags);
+
+ return error;
}
static const struct xattr_handler xfs_xattr_user_handler = {
@@ -106,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
NULL
};
-static unsigned int xfs_xattr_prefix_len(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return sizeof("security");
- else if (flags & XFS_ATTR_ROOT)
- return sizeof("trusted");
- else
- return sizeof("user");
-}
-
-static const char *xfs_xattr_prefix(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return xfs_xattr_security_handler.prefix;
- else if (flags & XFS_ATTR_ROOT)
- return xfs_xattr_trusted_handler.prefix;
- else
- return xfs_xattr_user_handler.prefix;
-}
-
static int
-xfs_xattr_put_listent(
+__xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen,
- unsigned char *value)
+ char *prefix,
+ int prefix_len,
+ unsigned char *name,
+ int namelen)
{
- unsigned int prefix_len = xfs_xattr_prefix_len(flags);
char *offset;
int arraytop;
- ASSERT(context->count >= 0);
-
- /*
- * Only show root namespace entries if we are actually allowed to
- * see them.
- */
- if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
- return 0;
+ if (!context->alist)
+ goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
@@ -154,17 +149,19 @@ xfs_xattr_put_listent(
return 1;
}
offset = (char *)context->alist + context->count;
- strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+ strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
+
+compute_size:
context->count += prefix_len + namelen + 1;
return 0;
}
static int
-xfs_xattr_put_listent_sizes(
+xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
int flags,
unsigned char *name,
@@ -172,24 +169,55 @@ xfs_xattr_put_listent_sizes(
int valuelen,
unsigned char *value)
{
- context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
- return 0;
-}
+ char *prefix;
+ int prefix_len;
-static int
-list_one_attr(const char *name, const size_t len, void *data,
- size_t size, ssize_t *result)
-{
- char *p = data + *result;
+ ASSERT(context->count >= 0);
- *result += len;
- if (!size)
- return 0;
- if (*result > size)
- return -ERANGE;
+ if (flags & XFS_ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (namelen == SGI_ACL_FILE_SIZE &&
+ strncmp(name, SGI_ACL_FILE,
+ SGI_ACL_FILE_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_ACCESS,
+ strlen(XATTR_POSIX_ACL_ACCESS));
+ if (ret)
+ return ret;
+ } else if (namelen == SGI_ACL_DEFAULT_SIZE &&
+ strncmp(name, SGI_ACL_DEFAULT,
+ SGI_ACL_DEFAULT_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_DEFAULT,
+ strlen(XATTR_POSIX_ACL_DEFAULT));
+ if (ret)
+ return ret;
+ }
+#endif
- strcpy(p, name);
- return 0;
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ prefix = XATTR_TRUSTED_PREFIX;
+ prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ } else if (flags & XFS_ATTR_SECURE) {
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ } else {
+ prefix = XATTR_USER_PREFIX;
+ prefix_len = XATTR_USER_PREFIX_LEN;
+ }
+
+ return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
}
ssize_t
@@ -198,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
- int error;
/*
* First read the regular on-disk attributes.
@@ -207,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
context.dp = XFS_I(inode);
context.cursor = &cursor;
context.resynch = 1;
- context.alist = data;
+ context.alist = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
-
- if (size)
- context.put_listent = xfs_xattr_put_listent;
- else
- context.put_listent = xfs_xattr_put_listent_sizes;
+ context.put_listent = xfs_xattr_put_listent;
xfs_attr_list_int(&context);
if (context.count < 0)
return -ERANGE;
- /*
- * Then add the two synthetic ACL attributes.
- */
- if (posix_acl_access_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
- if (posix_acl_default_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
return context.count;
}
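The listxattr rewrite above folds the old size-only callback into the filling one: when no destination buffer is supplied, the same routine merely accumulates the length it would have written. A compact userspace sketch of that dual-purpose callback, with simplified context fields standing in for xfs_attr_list_context:

#include <string.h>

struct listctx {
	char	*buf;		/* NULL means "only compute the size" */
	size_t	 bufsize;
	size_t	 count;		/* bytes emitted, or required, so far */
	int	 overflow;
};

/* Emit "prefix" + "name" + NUL, or just account for it on a sizing pass. */
static int put_listent(struct listctx *ctx, const char *prefix,
		       const char *name)
{
	size_t plen = strlen(prefix);
	size_t nlen = strlen(name);

	if (ctx->buf) {
		if (ctx->count + plen + nlen + 1 > ctx->bufsize) {
			ctx->overflow = 1;	/* caller turns this into -ERANGE */
			return 1;		/* stop the attribute walk */
		}
		memcpy(ctx->buf + ctx->count, prefix, plen);
		memcpy(ctx->buf + ctx->count + plen, name, nlen);
		ctx->buf[ctx->count + plen + nlen] = '\0';
	}
	ctx->count += plen + nlen + 1;
	return 0;
}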