Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 2
-rw-r--r--  fs/9p/v9fs.c | 21
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/vfs_addr.c | 4
-rw-r--r--  fs/9p/vfs_dir.c | 21
-rw-r--r--  fs/9p/vfs_file.c | 24
-rw-r--r--  fs/9p/xattr.c | 4
-rw-r--r--  fs/Makefile | 4
-rw-r--r--  fs/afs/Kconfig | 12
-rw-r--r--  fs/afs/Makefile | 7
-rw-r--r--  fs/afs/addr_list.c | 209
-rw-r--r--  fs/afs/afs.h | 50
-rw-r--r--  fs/afs/cache.c | 2
-rw-r--r--  fs/afs/callback.c | 17
-rw-r--r--  fs/afs/cell.c | 66
-rw-r--r--  fs/afs/cmservice.c | 287
-rw-r--r--  fs/afs/dir.c | 79
-rw-r--r--  fs/afs/dynroot.c | 4
-rw-r--r--  fs/afs/file.c | 13
-rw-r--r--  fs/afs/flock.c | 26
-rw-r--r--  fs/afs/fs_probe.c | 279
-rw-r--r--  fs/afs/fsclient.c | 583
-rw-r--r--  fs/afs/inode.c | 56
-rw-r--r--  fs/afs/internal.h | 331
-rw-r--r--  fs/afs/misc.c | 52
-rw-r--r--  fs/afs/mntpt.c | 5
-rw-r--r--  fs/afs/proc.c | 110
-rw-r--r--  fs/afs/protocol_yfs.h | 174
-rw-r--r--  fs/afs/rotate.c | 275
-rw-r--r--  fs/afs/rxrpc.c | 165
-rw-r--r--  fs/afs/security.c | 13
-rw-r--r--  fs/afs/server.c | 145
-rw-r--r--  fs/afs/server_list.c | 10
-rw-r--r--  fs/afs/super.c | 5
-rw-r--r--  fs/afs/vl_list.c | 340
-rw-r--r--  fs/afs/vl_probe.c | 282
-rw-r--r--  fs/afs/vl_rotate.c | 325
-rw-r--r--  fs/afs/vlclient.c | 195
-rw-r--r--  fs/afs/volume.c | 56
-rw-r--r--  fs/afs/write.c | 30
-rw-r--r--  fs/afs/xattr.c | 2
-rw-r--r--  fs/afs/yfsclient.c | 2184
-rw-r--r--  fs/aio.c | 377
-rw-r--r--  fs/autofs/autofs_i.h | 16
-rw-r--r--  fs/autofs/dev-ioctl.c | 27
-rw-r--r--  fs/autofs/expire.c | 3
-rw-r--r--  fs/autofs/init.c | 2
-rw-r--r--  fs/autofs/inode.c | 86
-rw-r--r--  fs/autofs/root.c | 16
-rw-r--r--  fs/autofs/waitq.c | 10
-rw-r--r--  fs/bfs/bfs.h | 11
-rw-r--r--  fs/bfs/dir.c | 4
-rw-r--r--  fs/bfs/file.c | 2
-rw-r--r--  fs/bfs/inode.c | 66
-rw-r--r--  fs/binfmt_aout.c | 83
-rw-r--r--  fs/binfmt_elf.c | 32
-rw-r--r--  fs/binfmt_script.c | 57
-rw-r--r--  fs/block_dev.c | 96
-rw-r--r--  fs/btrfs/acl.c | 9
-rw-r--r--  fs/btrfs/async-thread.c | 10
-rw-r--r--  fs/btrfs/backref.c | 35
-rw-r--r--  fs/btrfs/btrfs_inode.h | 14
-rw-r--r--  fs/btrfs/check-integrity.c | 24
-rw-r--r--  fs/btrfs/compression.c | 288
-rw-r--r--  fs/btrfs/compression.h | 52
-rw-r--r--  fs/btrfs/ctree.c | 179
-rw-r--r--  fs/btrfs/ctree.h | 366
-rw-r--r--  fs/btrfs/delayed-ref.c | 126
-rw-r--r--  fs/btrfs/delayed-ref.h | 14
-rw-r--r--  fs/btrfs/dev-replace.c | 200
-rw-r--r--  fs/btrfs/dev-replace.h | 8
-rw-r--r--  fs/btrfs/disk-io.c | 245
-rw-r--r--  fs/btrfs/disk-io.h | 10
-rw-r--r--  fs/btrfs/extent-tree.c | 1564
-rw-r--r--  fs/btrfs/extent_io.c | 524
-rw-r--r--  fs/btrfs/extent_io.h | 81
-rw-r--r--  fs/btrfs/extent_map.c | 8
-rw-r--r--  fs/btrfs/extent_map.h | 22
-rw-r--r--  fs/btrfs/file-item.c | 13
-rw-r--r--  fs/btrfs/file.c | 71
-rw-r--r--  fs/btrfs/free-space-cache.c | 54
-rw-r--r--  fs/btrfs/free-space-tree.c | 15
-rw-r--r--  fs/btrfs/inode.c | 933
-rw-r--r--  fs/btrfs/ioctl.c | 676
-rw-r--r--  fs/btrfs/locking.c | 108
-rw-r--r--  fs/btrfs/locking.h | 15
-rw-r--r--  fs/btrfs/lzo.c | 33
-rw-r--r--  fs/btrfs/ordered-data.c | 30
-rw-r--r--  fs/btrfs/ordered-data.h | 47
-rw-r--r--  fs/btrfs/qgroup.c | 408
-rw-r--r--  fs/btrfs/qgroup.h | 126
-rw-r--r--  fs/btrfs/raid56.c | 5
-rw-r--r--  fs/btrfs/reada.c | 16
-rw-r--r--  fs/btrfs/ref-verify.c | 10
-rw-r--r--  fs/btrfs/relocation.c | 170
-rw-r--r--  fs/btrfs/root-tree.c | 4
-rw-r--r--  fs/btrfs/scrub.c | 134
-rw-r--r--  fs/btrfs/send.c | 19
-rw-r--r--  fs/btrfs/super.c | 115
-rw-r--r--  fs/btrfs/sysfs.c | 14
-rw-r--r--  fs/btrfs/sysfs.h | 2
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 4
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 29
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 6
-rw-r--r--  fs/btrfs/transaction.c | 135
-rw-r--r--  fs/btrfs/transaction.h | 16
-rw-r--r--  fs/btrfs/tree-checker.c | 16
-rw-r--r--  fs/btrfs/tree-defrag.c | 2
-rw-r--r--  fs/btrfs/tree-log.c | 348
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 975
-rw-r--r--  fs/btrfs/volumes.h | 30
-rw-r--r--  fs/btrfs/xattr.c | 8
-rw-r--r--  fs/btrfs/zlib.c | 45
-rw-r--r--  fs/btrfs/zstd.c | 316
-rw-r--r--  fs/buffer.c | 47
-rw-r--r--  fs/cachefiles/namei.c | 8
-rw-r--r--  fs/cachefiles/rdwr.c | 9
-rw-r--r--  fs/cachefiles/xattr.c | 3
-rw-r--r--  fs/ceph/acl.c | 13
-rw-r--r--  fs/ceph/addr.c | 12
-rw-r--r--  fs/ceph/caps.c | 98
-rw-r--r--  fs/ceph/file.c | 585
-rw-r--r--  fs/ceph/inode.c | 61
-rw-r--r--  fs/ceph/mds_client.c | 150
-rw-r--r--  fs/ceph/mds_client.h | 16
-rw-r--r--  fs/ceph/mdsmap.c | 1
-rw-r--r--  fs/ceph/quota.c | 16
-rw-r--r--  fs/ceph/snap.c | 3
-rw-r--r--  fs/ceph/super.c | 17
-rw-r--r--  fs/ceph/super.h | 9
-rw-r--r--  fs/ceph/xattr.c | 3
-rw-r--r--  fs/cifs/Kconfig | 7
-rw-r--r--  fs/cifs/Makefile | 2
-rw-r--r--  fs/cifs/cifs_debug.c | 86
-rw-r--r--  fs/cifs/cifs_debug.h | 28
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 145
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 10
-rw-r--r--  fs/cifs/cifs_ioctl.h | 11
-rw-r--r--  fs/cifs/cifs_spnego.c | 6
-rw-r--r--  fs/cifs/cifsencrypt.c | 13
-rw-r--r--  fs/cifs/cifsfs.c | 88
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 57
-rw-r--r--  fs/cifs/cifspdu.h | 3
-rw-r--r--  fs/cifs/cifsproto.h | 30
-rw-r--r--  fs/cifs/cifssmb.c | 162
-rw-r--r--  fs/cifs/connect.c | 961
-rw-r--r--  fs/cifs/dfs_cache.c | 1368
-rw-r--r--  fs/cifs/dfs_cache.h | 97
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 544
-rw-r--r--  fs/cifs/inode.c | 129
-rw-r--r--  fs/cifs/ioctl.c | 48
-rw-r--r--  fs/cifs/misc.c | 70
-rw-r--r--  fs/cifs/readdir.c | 9
-rw-r--r--  fs/cifs/sess.c | 4
-rw-r--r--  fs/cifs/smb1ops.c | 15
-rw-r--r--  fs/cifs/smb2file.c | 8
-rw-r--r--  fs/cifs/smb2glob.h | 2
-rw-r--r--  fs/cifs/smb2inode.c | 347
-rw-r--r--  fs/cifs/smb2maperror.c | 6
-rw-r--r--  fs/cifs/smb2misc.c | 7
-rw-r--r--  fs/cifs/smb2ops.c | 611
-rw-r--r--  fs/cifs/smb2pdu.c | 504
-rw-r--r--  fs/cifs/smb2pdu.h | 70
-rw-r--r--  fs/cifs/smb2proto.h | 38
-rw-r--r--  fs/cifs/smbdirect.c | 57
-rw-r--r--  fs/cifs/trace.c | 10
-rw-r--r--  fs/cifs/trace.h | 161
-rw-r--r--  fs/cifs/transport.c | 189
-rw-r--r--  fs/compat_ioctl.c | 131
-rw-r--r--  fs/cramfs/inode.c | 12
-rw-r--r--  fs/crypto/bio.c | 3
-rw-r--r--  fs/crypto/crypto.c | 28
-rw-r--r--  fs/crypto/fname.c | 22
-rw-r--r--  fs/crypto/fscrypt_private.h | 67
-rw-r--r--  fs/crypto/keyinfo.c | 353
-rw-r--r--  fs/crypto/policy.c | 5
-rw-r--r--  fs/dax.c | 942
-rw-r--r--  fs/dcache.c | 78
-rw-r--r--  fs/debugfs/inode.c | 40
-rw-r--r--  fs/direct-io.c | 19
-rw-r--r--  fs/dlm/ast.c | 10
-rw-r--r--  fs/dlm/lock.c | 17
-rw-r--r--  fs/dlm/lockspace.c | 9
-rw-r--r--  fs/dlm/lowcomms.c | 6
-rw-r--r--  fs/dlm/member.c | 7
-rw-r--r--  fs/dlm/memory.c | 9
-rw-r--r--  fs/dlm/user.c | 5
-rw-r--r--  fs/drop_caches.c | 8
-rw-r--r--  fs/ecryptfs/crypto.c | 5
-rw-r--r--  fs/eventpoll.c | 439
-rw-r--r--  fs/exec.c | 126
-rw-r--r--  fs/exofs/ore.c | 3
-rw-r--r--  fs/exofs/ore_raid.c | 3
-rw-r--r--  fs/exofs/super.c | 44
-rw-r--r--  fs/exportfs/expfs.c | 3
-rw-r--r--  fs/ext2/acl.c | 4
-rw-r--r--  fs/ext2/dir.c | 35
-rw-r--r--  fs/ext2/ext2.h | 21
-rw-r--r--  fs/ext2/file.c | 1
-rw-r--r--  fs/ext2/ialloc.c | 2
-rw-r--r--  fs/ext2/inode.c | 30
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/super.c | 62
-rw-r--r--  fs/ext2/symlink.c | 2
-rw-r--r--  fs/ext2/xattr.c | 6
-rw-r--r--  fs/ext4/acl.c | 3
-rw-r--r--  fs/ext4/ext4.h | 26
-rw-r--r--  fs/ext4/fsync.c | 3
-rw-r--r--  fs/ext4/ialloc.c | 4
-rw-r--r--  fs/ext4/inline.c | 11
-rw-r--r--  fs/ext4/inode.c | 73
-rw-r--r--  fs/ext4/ioctl.c | 2
-rw-r--r--  fs/ext4/migrate.c | 48
-rw-r--r--  fs/ext4/namei.c | 13
-rw-r--r--  fs/ext4/page-io.c | 3
-rw-r--r--  fs/ext4/readpage.c | 5
-rw-r--r--  fs/ext4/resize.c | 105
-rw-r--r--  fs/ext4/super.c | 111
-rw-r--r--  fs/ext4/xattr.c | 110
-rw-r--r--  fs/f2fs/acl.c | 20
-rw-r--r--  fs/f2fs/checkpoint.c | 33
-rw-r--r--  fs/f2fs/data.c | 177
-rw-r--r--  fs/f2fs/debug.c | 45
-rw-r--r--  fs/f2fs/dir.c | 22
-rw-r--r--  fs/f2fs/f2fs.h | 104
-rw-r--r--  fs/f2fs/file.c | 42
-rw-r--r--  fs/f2fs/gc.c | 81
-rw-r--r--  fs/f2fs/inline.c | 22
-rw-r--r--  fs/f2fs/inode.c | 22
-rw-r--r--  fs/f2fs/namei.c | 8
-rw-r--r--  fs/f2fs/node.c | 46
-rw-r--r--  fs/f2fs/node.h | 2
-rw-r--r--  fs/f2fs/recovery.c | 4
-rw-r--r--  fs/f2fs/segment.c | 101
-rw-r--r--  fs/f2fs/segment.h | 2
-rw-r--r--  fs/f2fs/shrinker.c | 2
-rw-r--r--  fs/f2fs/super.c | 175
-rw-r--r--  fs/f2fs/sysfs.c | 27
-rw-r--r--  fs/f2fs/xattr.c | 22
-rw-r--r--  fs/fat/cache.c | 2
-rw-r--r--  fs/fat/dir.c | 14
-rw-r--r--  fs/fat/fat.h | 34
-rw-r--r--  fs/fat/fatent.c | 16
-rw-r--r--  fs/fat/file.c | 18
-rw-r--r--  fs/fat/inode.c | 35
-rw-r--r--  fs/fat/misc.c | 93
-rw-r--r--  fs/fat/namei_msdos.c | 17
-rw-r--r--  fs/fat/namei_vfat.c | 15
-rw-r--r--  fs/file.c | 47
-rw-r--r--  fs/file_table.c | 16
-rw-r--r--  fs/fs-writeback.c | 65
-rw-r--r--  fs/fs_types.c | 105
-rw-r--r--  fs/fscache/object.c | 3
-rw-r--r--  fs/fuse/Makefile | 2
-rw-r--r--  fs/fuse/control.c | 34
-rw-r--r--  fs/fuse/dev.c | 241
-rw-r--r--  fs/fuse/dir.c | 407
-rw-r--r--  fs/fuse/file.c | 230
-rw-r--r--  fs/fuse/fuse_i.h | 128
-rw-r--r--  fs/fuse/inode.c | 56
-rw-r--r--  fs/fuse/readdir.c | 569
-rw-r--r--  fs/gfs2/aops.c | 18
-rw-r--r--  fs/gfs2/bmap.c | 64
-rw-r--r--  fs/gfs2/file.c | 12
-rw-r--r--  fs/gfs2/glock.c | 2
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 16
-rw-r--r--  fs/gfs2/incore.h | 3
-rw-r--r--  fs/gfs2/inode.c | 18
-rw-r--r--  fs/gfs2/inode.h | 10
-rw-r--r--  fs/gfs2/log.c | 5
-rw-r--r--  fs/gfs2/log.h | 5
-rw-r--r--  fs/gfs2/lops.c | 79
-rw-r--r--  fs/gfs2/lops.h | 2
-rw-r--r--  fs/gfs2/meta_io.c | 3
-rw-r--r--  fs/gfs2/recovery.c | 55
-rw-r--r--  fs/gfs2/recovery.h | 3
-rw-r--r--  fs/gfs2/rgrp.c | 5
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/trans.c | 8
-rw-r--r--  fs/hfs/brec.c | 5
-rw-r--r--  fs/hfs/btree.c | 44
-rw-r--r--  fs/hfs/btree.h | 1
-rw-r--r--  fs/hfs/catalog.c | 16
-rw-r--r--  fs/hfs/extent.c | 10
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfsplus/attributes.c | 10
-rw-r--r--  fs/hfsplus/brec.c | 5
-rw-r--r--  fs/hfsplus/btree.c | 47
-rw-r--r--  fs/hfsplus/catalog.c | 24
-rw-r--r--  fs/hfsplus/dir.c | 1
-rw-r--r--  fs/hfsplus/extents.c | 8
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 4
-rw-r--r--  fs/hfsplus/inode.c | 22
-rw-r--r--  fs/hugetlbfs/inode.c | 14
-rw-r--r--  fs/inode.c | 16
-rw-r--r--  fs/io_uring.c | 2971
-rw-r--r--  fs/ioctl.c | 15
-rw-r--r--  fs/iomap.c | 159
-rw-r--r--  fs/isofs/dir.c | 2
-rw-r--r--  fs/jbd2/commit.c | 3
-rw-r--r--  fs/jbd2/transaction.c | 45
-rw-r--r--  fs/jffs2/super.c | 3
-rw-r--r--  fs/kernfs/dir.c | 2
-rw-r--r--  fs/kernfs/file.c | 54
-rw-r--r--  fs/kernfs/inode.c | 2
-rw-r--r--  fs/kernfs/kernfs-internal.h | 2
-rw-r--r--  fs/kernfs/mount.c | 18
-rw-r--r--  fs/kernfs/symlink.c | 5
-rw-r--r--  fs/lockd/clnt4xdr.c | 22
-rw-r--r--  fs/lockd/clntproc.c | 8
-rw-r--r--  fs/lockd/clntxdr.c | 22
-rw-r--r--  fs/lockd/host.c | 2
-rw-r--r--  fs/lockd/svclock.c | 2
-rw-r--r--  fs/lockd/xdr.c | 4
-rw-r--r--  fs/lockd/xdr4.c | 4
-rw-r--r--  fs/locks.c | 376
-rw-r--r--  fs/mpage.c | 3
-rw-r--r--  fs/namei.c | 7
-rw-r--r--  fs/namespace.c | 196
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 4
-rw-r--r--  fs/nfs/callback.c | 10
-rw-r--r--  fs/nfs/callback_proc.c | 26
-rw-r--r--  fs/nfs/client.c | 9
-rw-r--r--  fs/nfs/delegation.c | 56
-rw-r--r--  fs/nfs/delegation.h | 10
-rw-r--r--  fs/nfs/dir.c | 354
-rw-r--r--  fs/nfs/direct.c | 9
-rw-r--r--  fs/nfs/dns_resolve.c | 15
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 1
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 88
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 12
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 37
-rw-r--r--  fs/nfs/inode.c | 83
-rw-r--r--  fs/nfs/internal.h | 17
-rw-r--r--  fs/nfs/nfs3proc.c | 9
-rw-r--r--  fs/nfs/nfs3xdr.c | 10
-rw-r--r--  fs/nfs/nfs42proc.c | 19
-rw-r--r--  fs/nfs/nfs4_fs.h | 73
-rw-r--r--  fs/nfs/nfs4client.c | 20
-rw-r--r--  fs/nfs/nfs4file.c | 20
-rw-r--r--  fs/nfs/nfs4idmap.c | 31
-rw-r--r--  fs/nfs/nfs4proc.c | 219
-rw-r--r--  fs/nfs/nfs4renewd.c | 9
-rw-r--r--  fs/nfs/nfs4session.c | 5
-rw-r--r--  fs/nfs/nfs4state.c | 409
-rw-r--r--  fs/nfs/nfs4trace.h | 456
-rw-r--r--  fs/nfs/nfs4xdr.c | 2
-rw-r--r--  fs/nfs/pagelist.c | 53
-rw-r--r--  fs/nfs/pnfs.c | 30
-rw-r--r--  fs/nfs/pnfs.h | 11
-rw-r--r--  fs/nfs/pnfs_dev.c | 4
-rw-r--r--  fs/nfs/pnfs_nfs.c | 2
-rw-r--r--  fs/nfs/proc.c | 2
-rw-r--r--  fs/nfs/read.c | 10
-rw-r--r--  fs/nfs/super.c | 52
-rw-r--r--  fs/nfs/unlink.c | 20
-rw-r--r--  fs/nfs/write.c | 46
-rw-r--r--  fs/nfsd/cache.h | 20
-rw-r--r--  fs/nfsd/export.c | 14
-rw-r--r--  fs/nfsd/export.h | 2
-rw-r--r--  fs/nfsd/netns.h | 8
-rw-r--r--  fs/nfsd/nfs4callback.c | 129
-rw-r--r--  fs/nfsd/nfs4idmap.c | 11
-rw-r--r--  fs/nfsd/nfs4layouts.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 307
-rw-r--r--  fs/nfsd/nfs4recover.c | 17
-rw-r--r--  fs/nfsd/nfs4state.c | 57
-rw-r--r--  fs/nfsd/nfs4xdr.c | 50
-rw-r--r--  fs/nfsd/nfscache.c | 144
-rw-r--r--  fs/nfsd/nfsctl.c | 7
-rw-r--r--  fs/nfsd/state.h | 12
-rw-r--r--  fs/nfsd/vfs.c | 36
-rw-r--r--  fs/nfsd/xdr4.h | 28
-rw-r--r--  fs/nfsd/xdr4cb.h | 10
-rw-r--r--  fs/nilfs2/btnode.c | 28
-rw-r--r--  fs/nilfs2/page.c | 29
-rw-r--r--  fs/notify/fanotify/Kconfig | 1
-rw-r--r--  fs/notify/fanotify/fanotify.c | 298
-rw-r--r--  fs/notify/fanotify/fanotify.h | 120
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 468
-rw-r--r--  fs/notify/fdinfo.c | 30
-rw-r--r--  fs/notify/fsnotify.c | 48
-rw-r--r--  fs/notify/fsnotify.h | 11
-rw-r--r--  fs/notify/inotify/inotify.h | 1
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 18
-rw-r--r--  fs/notify/inotify/inotify_user.c | 13
-rw-r--r--  fs/notify/mark.c | 85
-rw-r--r--  fs/notify/notification.c | 42
-rw-r--r--  fs/ntfs/malloc.h | 2
-rw-r--r--  fs/ntfs/namei.c | 2
-rw-r--r--  fs/ocfs2/Makefile | 2
-rw-r--r--  fs/ocfs2/alloc.c | 163
-rw-r--r--  fs/ocfs2/aops.c | 18
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 79
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 17
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 9
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 14
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dir.c | 3
-rw-r--r--  fs/ocfs2/dlm/Makefile | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 2
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 7
-rw-r--r--  fs/ocfs2/dlmglue.c | 33
-rw-r--r--  fs/ocfs2/export.c | 2
-rw-r--r--  fs/ocfs2/file.c | 97
-rw-r--r--  fs/ocfs2/journal.c | 57
-rw-r--r--  fs/ocfs2/localalloc.c | 12
-rw-r--r--  fs/ocfs2/locks.c | 10
-rw-r--r--  fs/ocfs2/move_extents.c | 64
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 150
-rw-r--r--  fs/ocfs2/refcounttree.h | 24
-rw-r--r--  fs/ocfs2/slot_map.c | 8
-rw-r--r--  fs/ocfs2/stackglue.c | 6
-rw-r--r--  fs/ocfs2/stackglue.h | 3
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/openpromfs/inode.c | 11
-rw-r--r--  fs/orangefs/file.c | 4
-rw-r--r--  fs/orangefs/inode.c | 4
-rw-r--r--  fs/orangefs/orangefs-bufmap.c | 2
-rw-r--r--  fs/overlayfs/copy_up.c | 219
-rw-r--r--  fs/overlayfs/dir.c | 48
-rw-r--r--  fs/overlayfs/export.c | 6
-rw-r--r--  fs/overlayfs/file.c | 43
-rw-r--r--  fs/overlayfs/namei.c | 4
-rw-r--r--  fs/overlayfs/overlayfs.h | 14
-rw-r--r--  fs/overlayfs/super.c | 68
-rw-r--r--  fs/overlayfs/util.c | 46
-rw-r--r--  fs/pipe.c | 3
-rw-r--r--  fs/pnode.c | 1
-rw-r--r--  fs/proc/array.c | 26
-rw-r--r--  fs/proc/base.c | 116
-rw-r--r--  fs/proc/generic.c | 4
-rw-r--r--  fs/proc/inode.c | 7
-rw-r--r--  fs/proc/internal.h | 4
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/loadavg.c | 3
-rw-r--r--  fs/proc/meminfo.c | 16
-rw-r--r--  fs/proc/page.c | 8
-rw-r--r--  fs/proc/proc_net.c | 20
-rw-r--r--  fs/proc/proc_sysctl.c | 13
-rw-r--r--  fs/proc/root.c | 2
-rw-r--r--  fs/proc/self.c | 16
-rw-r--r--  fs/proc/stat.c | 89
-rw-r--r--  fs/proc/task_mmu.c | 45
-rw-r--r--  fs/proc/task_nommu.c | 4
-rw-r--r--  fs/proc/thread_self.c | 16
-rw-r--r--  fs/proc/util.c | 1
-rw-r--r--  fs/proc/vmcore.c | 4
-rw-r--r--  fs/pstore/Kconfig | 1
-rw-r--r--  fs/pstore/ftrace.c | 2
-rw-r--r--  fs/pstore/inode.c | 51
-rw-r--r--  fs/pstore/platform.c | 174
-rw-r--r--  fs/pstore/pmsg.c | 2
-rw-r--r--  fs/pstore/ram.c | 148
-rw-r--r--  fs/pstore/ram_core.c | 47
-rw-r--r--  fs/quota/quota.c | 3
-rw-r--r--  fs/read_write.c | 450
-rw-r--r--  fs/readdir.c | 10
-rw-r--r--  fs/reiserfs/Makefile | 9
-rw-r--r--  fs/reiserfs/xattr.c | 7
-rw-r--r--  fs/select.c | 371
-rw-r--r--  fs/splice.c | 28
-rw-r--r--  fs/statfs.c | 14
-rw-r--r--  fs/super.c | 26
-rw-r--r--  fs/sysfs/dir.c | 3
-rw-r--r--  fs/sysfs/file.c | 12
-rw-r--r--  fs/sysfs/group.c | 3
-rw-r--r--  fs/sysfs/symlink.c | 3
-rw-r--r--  fs/sysv/inode.c | 2
-rw-r--r--  fs/timerfd.c | 4
-rw-r--r--  fs/ubifs/Kconfig | 27
-rw-r--r--  fs/ubifs/Makefile | 1
-rw-r--r--  fs/ubifs/auth.c | 501
-rw-r--r--  fs/ubifs/debug.c | 6
-rw-r--r--  fs/ubifs/file.c | 2
-rw-r--r--  fs/ubifs/gc.c | 49
-rw-r--r--  fs/ubifs/io.c | 110
-rw-r--r--  fs/ubifs/journal.c | 289
-rw-r--r--  fs/ubifs/log.c | 24
-rw-r--r--  fs/ubifs/lpt.c | 184
-rw-r--r--  fs/ubifs/lpt_commit.c | 44
-rw-r--r--  fs/ubifs/master.c | 64
-rw-r--r--  fs/ubifs/misc.h | 5
-rw-r--r--  fs/ubifs/recovery.c | 120
-rw-r--r--  fs/ubifs/replay.c | 227
-rw-r--r--  fs/ubifs/sb.c | 222
-rw-r--r--  fs/ubifs/super.c | 91
-rw-r--r--  fs/ubifs/tnc.c | 36
-rw-r--r--  fs/ubifs/tnc_commit.c | 27
-rw-r--r--  fs/ubifs/tnc_misc.c | 26
-rw-r--r--  fs/ubifs/ubifs-media.h | 46
-rw-r--r--  fs/ubifs/ubifs.h | 253
-rw-r--r--  fs/udf/balloc.c | 30
-rw-r--r--  fs/udf/inode.c | 6
-rw-r--r--  fs/udf/super.c | 206
-rw-r--r--  fs/udf/udf_sb.h | 10
-rw-r--r--  fs/udf/unicode.c | 14
-rw-r--r--  fs/userfaultfd.c | 47
-rw-r--r--  fs/utimes.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 15
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 91
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 74
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 32
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 313
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 20
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 13
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 49
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 67
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h | 37
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 100
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 12
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 57
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 47
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c | 13
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 11
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 240
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 54
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 7
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_types.c | 33
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 25
-rw-r--r--  fs/xfs/scrub/agheader.c | 35
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 17
-rw-r--r--  fs/xfs/scrub/alloc.c | 4
-rw-r--r--  fs/xfs/scrub/attr.c | 11
-rw-r--r--  fs/xfs/scrub/bmap.c | 27
-rw-r--r--  fs/xfs/scrub/btree.c | 45
-rw-r--r--  fs/xfs/scrub/btree.h | 22
-rw-r--r--  fs/xfs/scrub/common.c | 14
-rw-r--r--  fs/xfs/scrub/common.h | 2
-rw-r--r--  fs/xfs/scrub/dir.c | 6
-rw-r--r--  fs/xfs/scrub/ialloc.c | 378
-rw-r--r--  fs/xfs/scrub/inode.c | 4
-rw-r--r--  fs/xfs/scrub/refcount.c | 16
-rw-r--r--  fs/xfs/scrub/repair.c | 68
-rw-r--r--  fs/xfs/scrub/repair.h | 10
-rw-r--r--  fs/xfs/scrub/rmap.c | 35
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 5
-rw-r--r--  fs/xfs/scrub/scrub.h | 4
-rw-r--r--  fs/xfs/scrub/trace.h | 176
-rw-r--r--  fs/xfs/xfs_aops.c | 273
-rw-r--r--  fs/xfs/xfs_aops.h | 21
-rw-r--r--  fs/xfs/xfs_attr_list.c | 1
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 23
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 3
-rw-r--r--  fs/xfs/xfs_buf.c | 64
-rw-r--r--  fs/xfs/xfs_buf.h | 8
-rw-r--r--  fs/xfs/xfs_buf_item.c | 28
-rw-r--r--  fs/xfs/xfs_error.c | 6
-rw-r--r--  fs/xfs/xfs_error.h | 1
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 5
-rw-r--r--  fs/xfs/xfs_file.c | 114
-rw-r--r--  fs/xfs/xfs_fsops.c | 4
-rw-r--r--  fs/xfs/xfs_globals.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 785
-rw-r--r--  fs/xfs/xfs_inode.h | 3
-rw-r--r--  fs/xfs/xfs_ioctl.c | 2
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 58
-rw-r--r--  fs/xfs/xfs_iomap.c | 518
-rw-r--r--  fs/xfs/xfs_iomap.h | 7
-rw-r--r--  fs/xfs/xfs_iops.c | 21
-rw-r--r--  fs/xfs/xfs_itable.c | 14
-rw-r--r--  fs/xfs/xfs_log_recover.c | 22
-rw-r--r--  fs/xfs/xfs_message.c | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 9
-rw-r--r--  fs/xfs/xfs_mount.h | 21
-rw-r--r--  fs/xfs/xfs_ondisk.h | 21
-rw-r--r--  fs/xfs/xfs_pnfs.c | 2
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/xfs_reflink.c | 561
-rw-r--r--  fs/xfs/xfs_reflink.h | 33
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 57
-rw-r--r--  fs/xfs/xfs_super.c | 32
-rw-r--r--  fs/xfs/xfs_symlink.c | 33
-rw-r--r--  fs/xfs/xfs_sysctl.h | 1
-rw-r--r--  fs/xfs/xfs_sysfs.c | 24
-rw-r--r--  fs/xfs/xfs_trace.h | 159
-rw-r--r--  fs/xfs/xfs_trans.h | 7
-rw-r--r--  fs/xfs/xfs_trans_bmap.c | 10
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 2
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 39
-rw-r--r--  fs/xfs/xfs_trans_refcount.c | 10
-rw-r--r--  fs/xfs/xfs_trans_rmap.c | 10
-rw-r--r--  fs/xfs/xfs_xattr.c | 3
611 files changed, 34439 insertions, 14604 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227fa56b..6261719f6f2a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
- struct iattr iattr;
+ struct iattr iattr = { 0 };
struct posix_acl *old_acl = acl;
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 89bac3d2f05b..619128b55837 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -61,6 +61,8 @@ enum {
Opt_cache_loose, Opt_fscache, Opt_mmap,
/* Access options */
Opt_access, Opt_posixacl,
+ /* Lock timeout option */
+ Opt_locktimeout,
/* Error token */
Opt_err
};
@@ -80,6 +82,7 @@ static const match_table_t tokens = {
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
+ {Opt_locktimeout, "locktimeout=%u"},
{Opt_err, NULL}
};
@@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = NULL;
#endif
+ v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
if (!opts)
return 0;
@@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#endif
break;
+ case Opt_locktimeout:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ if (option < 1) {
+ p9_debug(P9_DEBUG_ERROR,
+ "locktimeout must be a greater than zero integer.\n");
+ ret = -EINVAL;
+ continue;
+ }
+ v9ses->session_lock_timeout = (long)option * HZ;
+ break;
+
default:
continue;
}
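
[Editorial note: the hunks above wire a new locktimeout=%u mount option into the 9p option parser: the value is given in seconds, must be at least 1, and is stored in jiffies (seconds * HZ). A minimal userspace sketch of that validate-and-convert step, assuming HZ=100 purely for illustration and using a hypothetical parse_locktimeout() helper:]

#include <stdio.h>
#include <stdlib.h>

#define HZ 100 /* assumed tick rate, for illustration only */

/* Mirrors the Opt_locktimeout case above: reject non-integers and
 * values below 1, otherwise convert seconds to jiffies. */
static int parse_locktimeout(const char *arg, long *jiffies)
{
        char *end;
        long option = strtol(arg, &end, 10);

        if (end == arg || *end != '\0')
                return -1;      /* "integer field, but no integer?" */
        if (option < 1)
                return -1;      /* must be greater than zero */
        *jiffies = option * HZ;
        return 0;
}

int main(void)
{
        long j;

        if (parse_locktimeout("30", &j) == 0)
                printf("retry interval: %ld jiffies\n", j);
        return 0;
}
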
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 982e017acadb..129e5243a6bf 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,6 +116,7 @@ struct v9fs_session_info {
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct rw_semaphore rename_sem;
+ long session_lock_timeout; /* retry interval for blocking locks */
};
/* cache_validity flags */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e1cbdfdb7c68..0bcbcc20f769 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -65,7 +65,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
if (retval == 0)
return retval;
- iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE);
+ iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE);
retval = p9_client_read(fid, page_offset(page), &to, &err);
if (err) {
@@ -175,7 +175,7 @@ static int v9fs_vfs_writepage_locked(struct page *page)
bvec.bv_page = page;
bvec.bv_offset = 0;
bvec.bv_len = len;
- iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len);
+ iov_iter_bvec(&from, WRITE, &bvec, 1, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b0405d6aac85..00745147329d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat)
return rettype;
}
-static void p9stat_init(struct p9_wstat *stbuf)
-{
- stbuf->name = NULL;
- stbuf->uid = NULL;
- stbuf->gid = NULL;
- stbuf->muid = NULL;
- stbuf->extension = NULL;
-}
-
/**
* v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
* @filp: opened file structure
@@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int err = 0;
struct p9_fid *fid;
int buflen;
- int reclen = 0;
struct p9_rdir *rdir;
struct kvec kvec;
@@ -133,7 +123,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (rdir->tail == rdir->head) {
struct iov_iter to;
int n;
- iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen);
+ iov_iter_kvec(&to, READ, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
if (err)
@@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
rdir->tail = n;
}
while (rdir->head < rdir->tail) {
- p9stat_init(&st);
err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
rdir->tail - rdir->head, &st);
- if (err) {
+ if (err <= 0) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- p9stat_free(&st);
return -EIO;
}
- reclen = st.size+2;
over = !dir_emit(ctx, st.name, strlen(st.name),
v9fs_qid2ino(&st.qid), dt_type(&st));
@@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (over)
return 0;
- rdir->head += reclen;
- ctx->pos += reclen;
+ rdir->head += err;
+ ctx->pos += err;
}
}
}
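
[Editorial note: the readdir hunks above change the contract with p9stat_read(): it now returns the number of bytes it consumed, with a return <= 0 meaning error, so the separately computed reclen goes away and both rdir->head and ctx->pos advance by the return value. A small userspace sketch of that consume-and-advance loop, with a toy newline-record parser standing in for p9stat_read():]

#include <stdio.h>
#include <string.h>

/* Toy parser: consumes one newline-terminated record and returns the
 * number of bytes consumed, or 0 on error, mirroring the p9stat_read
 * convention used above. */
static int parse_record(const char *buf, int len, char *name, size_t namesz)
{
        const char *nl = memchr(buf, '\n', len);
        size_t n;

        if (!nl)
                return 0;
        n = (size_t)(nl - buf);
        if (n >= namesz)
                n = namesz - 1;
        memcpy(name, buf, n);
        name[n] = '\0';
        return (int)(nl - buf) + 1;
}

int main(void)
{
        char buf[] = "alpha\nbeta\ngamma\n";
        int head = 0, tail = (int)strlen(buf);
        char name[16];

        while (head < tail) {
                int err = parse_record(buf + head, tail - head,
                                       name, sizeof(name));
                if (err <= 0)
                        return 1;       /* bail out, as the hunk does with -EIO */
                printf("emit %s\n", name);
                head += err;            /* advance by bytes consumed */
        }
        return 0;
}
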
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5f2e48d41d72..a25efa782fcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
uint8_t status = P9_LOCK_ERROR;
int res = 0;
unsigned char fl_type;
+ struct v9fs_session_info *v9ses;
fid = filp->private_data;
BUG_ON(fid == NULL);
@@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
+ v9ses = v9fs_inode2v9ses(file_inode(filp));
+
/*
* if its a blocked request and we get P9_LOCK_BLOCKED as the status
* for lock request, keep on trying
@@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
+ != 0)
break;
+ /*
+ * p9_client_lock_dotl overwrites flock.client_id with the
+ * server message, free and reuse the client name
+ */
+ if (flock.client_id != fid->clnt->name) {
+ kfree(flock.client_id);
+ flock.client_id = fid->clnt->name;
+ }
}
/* map 9p status to VFS status */
@@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fallthough */
+ /* fall through */
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -235,6 +247,8 @@ out_unlock:
locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
+ if (flock.client_id != fid->clnt->name)
+ kfree(flock.client_id);
out:
return res;
}
@@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
res = p9_client_getlock_dotl(fid, &glock);
if (res < 0)
- return res;
+ goto out;
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
@@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = -glock.proc_id;
}
- kfree(glock.client_id);
+out:
+ if (glock.client_id != fid->clnt->name)
+ kfree(glock.client_id);
return res;
}
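
[Editorial note: two of the hunks above fix the same leak pattern: p9_client_lock_dotl() and p9_client_getlock_dotl() can replace the borrowed client_id string with a freshly allocated copy taken from the server's reply, so the caller must free that replacement (and, inside the retry loop, restore the borrowed name) instead of leaking it. A userspace sketch of the idea, with hypothetical names:]

#include <stdlib.h>
#include <string.h>

struct lock_req {
        char *client_id;
};

/* Stands in for p9_client_lock_dotl(), which may overwrite client_id
 * with a server-allocated string. */
static void fake_lock_call(struct lock_req *req)
{
        req->client_id = strdup("server-assigned-id");
}

int main(void)
{
        static char name[] = "client-name";     /* plays fid->clnt->name */
        struct lock_req req = { .client_id = name };
        int attempt;

        for (attempt = 0; attempt < 3; attempt++) {
                fake_lock_call(&req);
                if (req.client_id != name) {
                        free(req.client_id);    /* drop the server's copy */
                        req.client_id = name;   /* reuse the client name */
                }
        }
        if (req.client_id != name)              /* final cleanup, as at out: */
                free(req.client_id);
        return 0;
}
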
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 352abc39e891..ac8ff8ca4c11 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,7 +32,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
struct iov_iter to;
int err;
- iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size);
+ iov_iter_kvec(&to, READ, &kvec, 1, buffer_size);
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
@@ -107,7 +107,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
struct iov_iter from;
int retval, err;
- iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len);
+ iov_iter_kvec(&from, WRITE, &kvec, 1, value_len);
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..ffeaa6632ab4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
- stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+ fs_types.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -30,6 +31,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index ebba3b18e5da..701aaa9b1899 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -27,3 +27,15 @@ config AFS_FSCACHE
help
Say Y here if you want AFS data to be cached locally on disk through
the generic filesystem cache manager
+
+config AFS_DEBUG_CURSOR
+ bool "AFS server cursor debugging"
+ depends on AFS_FS
+ help
+ Say Y here to cause the contents of a server cursor to be dumped to
+ the dmesg log if the server rotation algorithm fails to successfully
+ contact a server.
+
+ See <file:Documentation/filesystems/afs.txt> for more information.
+
+ If unsure, say N.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 546874057bd3..0738e2bf5193 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -17,6 +17,7 @@ kafs-y := \
file.o \
flock.o \
fsclient.o \
+ fs_probe.o \
inode.o \
main.o \
misc.o \
@@ -29,9 +30,13 @@ kafs-y := \
super.o \
netdevices.o \
vlclient.o \
+ vl_list.o \
+ vl_probe.o \
+ vl_rotate.o \
volume.o \
write.o \
- xattr.o
+ xattr.o \
+ yfsclient.o
kafs-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_AFS_FS) := kafs.o
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 55a756c60746..967db336d11a 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -64,19 +64,25 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
/*
* Parse a text string consisting of delimited addresses.
*/
-struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
- char delim,
- unsigned short service,
- unsigned short port)
+struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+ const char *text, size_t len,
+ char delim,
+ unsigned short service,
+ unsigned short port)
{
+ struct afs_vlserver_list *vllist;
struct afs_addr_list *alist;
const char *p, *end = text + len;
+ const char *problem;
unsigned int nr = 0;
+ int ret = -ENOMEM;
_enter("%*.*s,%c", (int)len, (int)len, text, delim);
- if (!len)
+ if (!len) {
+ _leave(" = -EDESTADDRREQ [empty]");
return ERR_PTR(-EDESTADDRREQ);
+ }
if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len)))
delim = ',';
@@ -84,18 +90,24 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
/* Count the addresses */
p = text;
do {
- if (!*p)
- return ERR_PTR(-EINVAL);
+ if (!*p) {
+ problem = "nul";
+ goto inval;
+ }
if (*p == delim)
continue;
nr++;
if (*p == '[') {
p++;
- if (p == end)
- return ERR_PTR(-EINVAL);
+ if (p == end) {
+ problem = "brace1";
+ goto inval;
+ }
p = memchr(p, ']', end - p);
- if (!p)
- return ERR_PTR(-EINVAL);
+ if (!p) {
+ problem = "brace2";
+ goto inval;
+ }
p++;
if (p >= end)
break;
@@ -109,10 +121,19 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
_debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES);
- alist = afs_alloc_addrlist(nr, service, port);
- if (!alist)
+ vllist = afs_alloc_vlserver_list(1);
+ if (!vllist)
return ERR_PTR(-ENOMEM);
+ vllist->nr_servers = 1;
+ vllist->servers[0].server = afs_alloc_vlserver("<dummy>", 7, AFS_VL_PORT);
+ if (!vllist->servers[0].server)
+ goto error_vl;
+
+ alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
+ if (!alist)
+ goto error;
+
/* Extract the addresses */
p = text;
do {
@@ -135,17 +156,21 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
break;
}
- if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop))
+ if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) {
family = AF_INET;
- else if (in6_pton(p, q - p, (u8 *)x, -1, &stop))
+ } else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) {
family = AF_INET6;
- else
+ } else {
+ problem = "family";
goto bad_address;
+ }
- if (stop != q)
+ p = q;
+ if (stop != p) {
+ problem = "nostop";
goto bad_address;
+ }
- p = q;
if (q < end && *q == ']')
p++;
@@ -154,18 +179,23 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
/* Port number specification "+1234" */
xport = 0;
p++;
- if (p >= end || !isdigit(*p))
+ if (p >= end || !isdigit(*p)) {
+ problem = "port";
goto bad_address;
+ }
do {
xport *= 10;
xport += *p - '0';
- if (xport > 65535)
+ if (xport > 65535) {
+ problem = "pval";
goto bad_address;
+ }
p++;
} while (p < end && isdigit(*p));
} else if (*p == delim) {
p++;
} else {
+ problem = "weird";
goto bad_address;
}
}
@@ -177,12 +207,23 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
} while (p < end);
+ rcu_assign_pointer(vllist->servers[0].server->addresses, alist);
_leave(" = [nr %u]", alist->nr_addrs);
- return alist;
+ return vllist;
-bad_address:
- kfree(alist);
+inval:
+ _leave(" = -EINVAL [%s %zu %*.*s]",
+ problem, p - text, (int)len, (int)len, text);
return ERR_PTR(-EINVAL);
+bad_address:
+ _leave(" = -EINVAL [%s %zu %*.*s]",
+ problem, p - text, (int)len, (int)len, text);
+ ret = -EINVAL;
+error:
+ afs_put_addrlist(alist);
+error_vl:
+ afs_put_vlserverlist(net, vllist);
+ return ERR_PTR(ret);
}
/*
@@ -201,30 +242,34 @@ static int afs_cmp_addr_list(const struct afs_addr_list *a1,
/*
* Perform a DNS query for VL servers and build a up an address list.
*/
-struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
+struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
{
- struct afs_addr_list *alist;
- char *vllist = NULL;
+ struct afs_vlserver_list *vllist;
+ char *result = NULL;
int ret;
_enter("%s", cell->name);
- ret = dns_query("afsdb", cell->name, cell->name_len,
- "", &vllist, _expiry);
- if (ret < 0)
+ ret = dns_query("afsdb", cell->name, cell->name_len, "srv=1",
+ &result, _expiry);
+ if (ret < 0) {
+ _leave(" = %d [dns]", ret);
return ERR_PTR(ret);
-
- alist = afs_parse_text_addrs(vllist, strlen(vllist), ',',
- VL_SERVICE, AFS_VL_PORT);
- if (IS_ERR(alist)) {
- kfree(vllist);
- if (alist != ERR_PTR(-ENOMEM))
- pr_err("Failed to parse DNS data\n");
- return alist;
}
- kfree(vllist);
- return alist;
+ if (*_expiry == 0)
+ *_expiry = ktime_get_real_seconds() + 60;
+
+ if (ret > 1 && result[0] == 0)
+ vllist = afs_extract_vlserver_list(cell, result, ret);
+ else
+ vllist = afs_parse_text_addrs(cell->net, result, ret, ',',
+ VL_SERVICE, AFS_VL_PORT);
+ kfree(result);
+ if (IS_ERR(vllist) && vllist != ERR_PTR(-ENOMEM))
+ pr_err("Failed to parse DNS data %ld\n", PTR_ERR(vllist));
+
+ return vllist;
}
/*
@@ -258,6 +303,8 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
srx = &alist->addrs[i];
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
srx->transport_len = sizeof(srx->transport.sin);
srx->transport.sin.sin_family = AF_INET;
srx->transport.sin.sin_port = htons(port);
@@ -296,6 +343,8 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
srx = &alist->addrs[i];
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
srx->transport_len = sizeof(srx->transport.sin6);
srx->transport.sin6.sin6_family = AF_INET6;
srx->transport.sin6.sin6_port = htons(port);
@@ -308,25 +357,33 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
*/
bool afs_iterate_addresses(struct afs_addr_cursor *ac)
{
- _enter("%hu+%hd", ac->start, (short)ac->index);
+ unsigned long set, failed;
+ int index;
if (!ac->alist)
return false;
- if (ac->begun) {
- ac->index++;
- if (ac->index == ac->alist->nr_addrs)
- ac->index = 0;
+ set = ac->alist->responded;
+ failed = ac->alist->failed;
+ _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index);
- if (ac->index == ac->start) {
- ac->error = -EDESTADDRREQ;
- return false;
- }
- }
+ ac->nr_iterations++;
+
+ set &= ~(failed | ac->tried);
+
+ if (!set)
+ return false;
- ac->begun = true;
+ index = READ_ONCE(ac->alist->preferred);
+ if (test_bit(index, &set))
+ goto selected;
+
+ index = __ffs(set);
+
+selected:
+ ac->index = index;
+ set_bit(index, &ac->tried);
ac->responded = false;
- ac->addr = &ac->alist->addrs[ac->index];
return true;
}
@@ -339,53 +396,13 @@ int afs_end_cursor(struct afs_addr_cursor *ac)
alist = ac->alist;
if (alist) {
- if (ac->responded && ac->index != ac->start)
- WRITE_ONCE(alist->index, ac->index);
+ if (ac->responded &&
+ ac->index != alist->preferred &&
+ test_bit(ac->alist->preferred, &ac->tried))
+ WRITE_ONCE(alist->preferred, ac->index);
afs_put_addrlist(alist);
+ ac->alist = NULL;
}
- ac->addr = NULL;
- ac->alist = NULL;
- ac->begun = false;
return ac->error;
}
-
-/*
- * Set the address cursor for iterating over VL servers.
- */
-int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell)
-{
- struct afs_addr_list *alist;
- int ret;
-
- if (!rcu_access_pointer(cell->vl_addrs)) {
- ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
- TASK_INTERRUPTIBLE);
- if (ret < 0)
- return ret;
-
- if (!rcu_access_pointer(cell->vl_addrs) &&
- ktime_get_real_seconds() < cell->dns_expiry)
- return cell->error;
- }
-
- read_lock(&cell->vl_addrs_lock);
- alist = rcu_dereference_protected(cell->vl_addrs,
- lockdep_is_held(&cell->vl_addrs_lock));
- if (alist->nr_addrs > 0)
- afs_get_addrlist(alist);
- else
- alist = NULL;
- read_unlock(&cell->vl_addrs_lock);
-
- if (!alist)
- return -EDESTADDRREQ;
-
- ac->alist = alist;
- ac->addr = NULL;
- ac->start = READ_ONCE(alist->index);
- ac->index = ac->start;
- ac->error = 0;
- ac->begun = false;
- return 0;
-}
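
[Editorial note: the rewritten afs_iterate_addresses() above replaces the old start/index ring walk with bitmaps: the candidates are the addresses that have responded, minus those that have failed or were already tried, and the recorded preferred index is taken first whenever it is still a candidate. A userspace sketch of that selection, using __builtin_ctzl() where the kernel uses __ffs():]

#include <stdbool.h>
#include <stdio.h>

static bool pick_addr(unsigned long responded, unsigned long failed,
                      unsigned long *tried, int preferred, int *index)
{
        unsigned long set = responded & ~(failed | *tried);

        if (!set)
                return false;                   /* nothing left to try */
        if (set & (1UL << preferred))
                *index = preferred;             /* sticky preferred address */
        else
                *index = __builtin_ctzl(set);   /* lowest candidate bit */
        *tried |= 1UL << *index;
        return true;
}

int main(void)
{
        /* addresses 0, 1 and 3 responded; 1 has failed; 3 is preferred */
        unsigned long tried = 0;
        int idx;

        while (pick_addr(0x0b, 0x02, &tried, 3, &idx))
                printf("trying address %d\n", idx);     /* 3, then 0 */
        return 0;
}
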
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index b4ff1f7ae4ab..d12ffb457e47 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -23,9 +23,9 @@
#define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */
#define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */
-typedef unsigned afs_volid_t;
-typedef unsigned afs_vnodeid_t;
-typedef unsigned long long afs_dataversion_t;
+typedef u64 afs_volid_t;
+typedef u64 afs_vnodeid_t;
+typedef u64 afs_dataversion_t;
typedef enum {
AFSVL_RWVOL, /* read/write volume */
@@ -52,8 +52,9 @@ typedef enum {
*/
struct afs_fid {
afs_volid_t vid; /* volume ID */
- afs_vnodeid_t vnode; /* file index within volume */
- unsigned unique; /* unique ID number (file index version) */
+ afs_vnodeid_t vnode; /* Lower 64-bits of file index within volume */
+ u32 vnode_hi; /* Upper 32-bits of file index */
+ u32 unique; /* unique ID number (file index version) */
};
/*
@@ -67,14 +68,14 @@ typedef enum {
} afs_callback_type_t;
struct afs_callback {
+ time64_t expires_at; /* Time at which expires */
unsigned version; /* Callback version */
- unsigned expiry; /* Time at which expires */
afs_callback_type_t type; /* Type of callback */
};
struct afs_callback_break {
struct afs_fid fid; /* File identifier */
- struct afs_callback cb; /* Callback details */
+ //struct afs_callback cb; /* Callback details */
};
#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */
@@ -129,19 +130,18 @@ typedef u32 afs_access_t;
struct afs_file_status {
u64 size; /* file size */
afs_dataversion_t data_version; /* current data version */
- time_t mtime_client; /* last time client changed data */
- time_t mtime_server; /* last time server changed data */
- unsigned abort_code; /* Abort if bulk-fetching this failed */
-
- afs_file_type_t type; /* file type */
- unsigned nlink; /* link count */
- u32 author; /* author ID */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
+ struct timespec64 mtime_client; /* Last time client changed data */
+ struct timespec64 mtime_server; /* Last time server changed data */
+ s64 author; /* author ID */
+ s64 owner; /* owner ID */
+ s64 group; /* group ID */
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
+ afs_file_type_t type; /* file type */
+ u32 nlink; /* link count */
s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
+ u32 abort_code; /* Abort if bulk-fetching this failed */
};
/*
@@ -158,25 +158,27 @@ struct afs_file_status {
* AFS volume synchronisation information
*/
struct afs_volsync {
- time_t creation; /* volume creation time */
+ time64_t creation; /* volume creation time */
};
/*
* AFS volume status record
*/
struct afs_volume_status {
- u32 vid; /* volume ID */
- u32 parent_id; /* parent volume ID */
+ afs_volid_t vid; /* volume ID */
+ afs_volid_t parent_id; /* parent volume ID */
u8 online; /* true if volume currently online and available */
u8 in_service; /* true if volume currently in service */
u8 blessed; /* same as in_service */
u8 needs_salvage; /* true if consistency checking required */
u32 type; /* volume type (afs_voltype_t) */
- u32 min_quota; /* minimum space set aside (blocks) */
- u32 max_quota; /* maximum space this volume may occupy (blocks) */
- u32 blocks_in_use; /* space this volume currently occupies (blocks) */
- u32 part_blocks_avail; /* space available in volume's partition */
- u32 part_max_blocks; /* size of volume's partition */
+ u64 min_quota; /* minimum space set aside (blocks) */
+ u64 max_quota; /* maximum space this volume may occupy (blocks) */
+ u64 blocks_in_use; /* space this volume currently occupies (blocks) */
+ u64 part_blocks_avail; /* space available in volume's partition */
+ u64 part_max_blocks; /* size of volume's partition */
+ s64 vol_copy_date;
+ s64 vol_backup_date;
};
#define AFS_BLOCK_SIZE 1024
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index b1c31ec4523a..f6d0a21e8052 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -49,7 +49,7 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
struct afs_vnode *vnode = cookie_netfs_data;
struct afs_vnode_cache_aux aux;
- _enter("{%x,%x,%llx},%p,%u",
+ _enter("{%llx,%x,%llx},%p,%u",
vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
buffer, buflen);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 5f261fbf2182..1c7955f5cdaf 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -210,12 +210,10 @@ void afs_init_callback_state(struct afs_server *server)
/*
* actually break a callback
*/
-void afs_break_callback(struct afs_vnode *vnode)
+void __afs_break_callback(struct afs_vnode *vnode)
{
_enter("");
- write_seqlock(&vnode->cb_lock);
-
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
vnode->cb_break++;
@@ -230,7 +228,12 @@ void afs_break_callback(struct afs_vnode *vnode)
afs_lock_may_be_available(vnode);
spin_unlock(&vnode->lock);
}
+}
+void afs_break_callback(struct afs_vnode *vnode)
+{
+ write_seqlock(&vnode->cb_lock);
+ __afs_break_callback(vnode);
write_sequnlock(&vnode->cb_lock);
}
@@ -310,14 +313,10 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
/* TODO: Sort the callback break list by volume ID */
for (; count > 0; callbacks++, count--) {
- _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }",
+ _debug("- Fid { vl=%08llx n=%llu u=%u }",
callbacks->fid.vid,
callbacks->fid.vnode,
- callbacks->fid.unique,
- callbacks->cb.version,
- callbacks->cb.expiry,
- callbacks->cb.type
- );
+ callbacks->fid.unique);
afs_break_one_callback(server, &callbacks->fid);
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 6127f0fcd62c..9de46116c749 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,6 +20,8 @@
#include "internal.h"
static unsigned __read_mostly afs_cell_gc_delay = 10;
+static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
+static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
static void afs_manage_cell(struct work_struct *);
@@ -119,7 +121,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
*/
static struct afs_cell *afs_alloc_cell(struct afs_net *net,
const char *name, unsigned int namelen,
- const char *vllist)
+ const char *addresses)
{
struct afs_cell *cell;
int i, ret;
@@ -134,7 +136,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
if (namelen == 5 && memcmp(name, "@cell", 5) == 0)
return ERR_PTR(-EINVAL);
- _enter("%*.*s,%s", namelen, namelen, name, vllist);
+ _enter("%*.*s,%s", namelen, namelen, name, addresses);
cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL);
if (!cell) {
@@ -153,23 +155,27 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
(1 << AFS_CELL_FL_NO_LOOKUP_YET));
INIT_LIST_HEAD(&cell->proc_volumes);
rwlock_init(&cell->proc_lock);
- rwlock_init(&cell->vl_addrs_lock);
+ rwlock_init(&cell->vl_servers_lock);
/* Fill in the VL server list if we were given a list of addresses to
* use.
*/
- if (vllist) {
- struct afs_addr_list *alist;
-
- alist = afs_parse_text_addrs(vllist, strlen(vllist), ':',
- VL_SERVICE, AFS_VL_PORT);
- if (IS_ERR(alist)) {
- ret = PTR_ERR(alist);
+ if (addresses) {
+ struct afs_vlserver_list *vllist;
+
+ vllist = afs_parse_text_addrs(net,
+ addresses, strlen(addresses), ':',
+ VL_SERVICE, AFS_VL_PORT);
+ if (IS_ERR(vllist)) {
+ ret = PTR_ERR(vllist);
goto parse_failed;
}
- rcu_assign_pointer(cell->vl_addrs, alist);
+ rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = TIME64_MAX;
+ __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags);
+ } else {
+ cell->dns_expiry = ktime_get_real_seconds();
}
_leave(" = %p", cell);
@@ -356,26 +362,40 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
*/
static void afs_update_cell(struct afs_cell *cell)
{
- struct afs_addr_list *alist, *old;
- time64_t now, expiry;
+ struct afs_vlserver_list *vllist, *old;
+ unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl);
+ unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl);
+ time64_t now, expiry = 0;
_enter("%s", cell->name);
- alist = afs_dns_query(cell, &expiry);
- if (IS_ERR(alist)) {
- switch (PTR_ERR(alist)) {
+ vllist = afs_dns_query(cell, &expiry);
+
+ now = ktime_get_real_seconds();
+ if (min_ttl > max_ttl)
+ max_ttl = min_ttl;
+ if (expiry < now + min_ttl)
+ expiry = now + min_ttl;
+ else if (expiry > now + max_ttl)
+ expiry = now + max_ttl;
+
+ if (IS_ERR(vllist)) {
+ switch (PTR_ERR(vllist)) {
case -ENODATA:
- /* The DNS said that the cell does not exist */
+ case -EDESTADDRREQ:
+ /* The DNS said that the cell does not exist or there
+ * weren't any addresses to be had.
+ */
set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
- cell->dns_expiry = ktime_get_real_seconds() + 61;
+ cell->dns_expiry = expiry;
break;
case -EAGAIN:
case -ECONNREFUSED:
default:
set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
- cell->dns_expiry = ktime_get_real_seconds() + 10;
+ cell->dns_expiry = now + 10;
break;
}
@@ -387,12 +407,12 @@ static void afs_update_cell(struct afs_cell *cell)
/* Exclusion on changing vl_addrs is achieved by a
* non-reentrant work item.
*/
- old = rcu_dereference_protected(cell->vl_addrs, true);
- rcu_assign_pointer(cell->vl_addrs, alist);
+ old = rcu_dereference_protected(cell->vl_servers, true);
+ rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = expiry;
if (old)
- afs_put_addrlist(old);
+ afs_put_vlserverlist(cell->net, old);
}
if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags))
@@ -414,7 +434,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
ASSERTCMP(atomic_read(&cell->usage), ==, 0);
- afs_put_addrlist(rcu_access_pointer(cell->vl_addrs));
+ afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
key_put(cell->anonymous_key);
kfree(cell);
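
[Editorial note: afs_update_cell() above now clamps the DNS-supplied expiry into a [now + min_ttl, now + max_ttl] window, with defaults of 10 minutes and 24 hours in the new module variables, raising max to min if the two cross. The clamp in isolation, as a userspace sketch:]

#include <stdio.h>
#include <time.h>

static long long clamp_expiry(long long expiry, long long now,
                              unsigned int min_ttl, unsigned int max_ttl)
{
        if (min_ttl > max_ttl)
                max_ttl = min_ttl;
        if (expiry < now + min_ttl)
                expiry = now + min_ttl;
        else if (expiry > now + max_ttl)
                expiry = now + max_ttl;
        return expiry;
}

int main(void)
{
        long long now = (long long)time(NULL);

        /* a 30s DNS TTL gets raised to the 10-minute floor: prints 600 */
        printf("%lld\n",
               clamp_expiry(now + 30, now, 10 * 60, 24 * 60 * 60) - now);
        return 0;
}
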
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 9e51d6fe7e8f..8ee5972893ed 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -16,6 +16,7 @@
#include <linux/ip.h>
#include "internal.h"
#include "afs_cm.h"
+#include "protocol_yfs.h"
static int afs_deliver_cb_init_call_back_state(struct afs_call *);
static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
@@ -30,6 +31,8 @@ static void SRXAFSCB_Probe(struct work_struct *);
static void SRXAFSCB_ProbeUuid(struct work_struct *);
static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
+static int afs_deliver_yfs_cb_callback(struct afs_call *);
+
#define CM_NAME(name) \
const char afs_SRXCB##name##_name[] __tracepoint_string = \
"CB." #name
@@ -101,12 +104,25 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
};
/*
+ * YFS CB.CallBack operation type
+ */
+static CM_NAME(YFS_CallBack);
+static const struct afs_call_type afs_SRXYFSCB_CallBack = {
+ .name = afs_SRXCBYFS_CallBack_name,
+ .deliver = afs_deliver_yfs_cb_callback,
+ .destructor = afs_cm_destructor,
+ .work = SRXAFSCB_CallBack,
+};
+
+/*
* route an incoming cache manager call
* - return T if supported, F if not
*/
bool afs_cm_incoming_call(struct afs_call *call)
{
- _enter("{CB.OP %u}", call->operation_ID);
+ _enter("{%u, CB.OP %u}", call->service_id, call->operation_ID);
+
+ call->epoch = rxrpc_kernel_get_epoch(call->net->socket, call->rxcall);
switch (call->operation_ID) {
case CBCallBack:
@@ -127,12 +143,102 @@ bool afs_cm_incoming_call(struct afs_call *call)
case CBTellMeAboutYourself:
call->type = &afs_SRXCBTellMeAboutYourself;
return true;
+ case YFSCBCallBack:
+ if (call->service_id != YFS_CM_SERVICE)
+ return false;
+ call->type = &afs_SRXYFSCB_CallBack;
+ return true;
default:
return false;
}
}
/*
+ * Record a probe to the cache manager from a server.
+ */
+static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
+{
+ _enter("");
+
+ if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) &&
+ !test_bit(AFS_SERVER_FL_PROBING, &server->flags)) {
+ if (server->cm_epoch == call->epoch)
+ return 0;
+
+ if (!server->probe.said_rebooted) {
+ pr_notice("kAFS: FS rebooted %pU\n", &server->uuid);
+ server->probe.said_rebooted = true;
+ }
+ }
+
+ spin_lock(&server->probe_lock);
+
+ if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+ server->cm_epoch = call->epoch;
+ server->probe.cm_epoch = call->epoch;
+ goto out;
+ }
+
+ if (server->probe.cm_probed &&
+ call->epoch != server->probe.cm_epoch &&
+ !server->probe.said_inconsistent) {
+ pr_notice("kAFS: FS endpoints inconsistent %pU\n",
+ &server->uuid);
+ server->probe.said_inconsistent = true;
+ }
+
+ if (!server->probe.cm_probed || call->epoch == server->cm_epoch)
+ server->probe.cm_epoch = server->cm_epoch;
+
+out:
+ server->probe.cm_probed = true;
+ spin_unlock(&server->probe_lock);
+ return 0;
+}
+
+/*
+ * Find the server record by peer address and record a probe to the cache
+ * manager from a server.
+ */
+static int afs_find_cm_server_by_peer(struct afs_call *call)
+{
+ struct sockaddr_rxrpc srx;
+ struct afs_server *server;
+
+ rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+
+ server = afs_find_server(call->net, &srx);
+ if (!server) {
+ trace_afs_cm_no_server(call, &srx);
+ return 0;
+ }
+
+ call->cm_server = server;
+ return afs_record_cm_probe(call, server);
+}
+
+/*
+ * Find the server record by server UUID and record a probe to the cache
+ * manager from a server.
+ */
+static int afs_find_cm_server_by_uuid(struct afs_call *call,
+ struct afs_uuid *uuid)
+{
+ struct afs_server *server;
+
+ rcu_read_lock();
+ server = afs_find_server_by_uuid(call->net, call->request);
+ rcu_read_unlock();
+ if (!server) {
+ trace_afs_cm_no_server_u(call, call->request);
+ return 0;
+ }
+
+ call->cm_server = server;
+ return afs_record_cm_probe(call, server);
+}
+
+/*
* Clean up a cache manager call.
*/
static void afs_cm_destructor(struct afs_call *call)
@@ -168,7 +274,6 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
static int afs_deliver_cb_callback(struct afs_call *call)
{
struct afs_callback_break *cb;
- struct sockaddr_rxrpc srx;
__be32 *bp;
int ret, loop;
@@ -176,32 +281,32 @@ static int afs_deliver_cb_callback(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* extract the FID array and its count in two steps */
case 1:
_debug("extract FID count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("FID count: %u", call->count);
if (call->count > AFSCBMAX)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_cb_fid_count);
call->buffer = kmalloc(array3_size(call->count, 3, 4),
GFP_KERNEL);
if (!call->buffer)
return -ENOMEM;
- call->offset = 0;
+ afs_extract_to_buf(call, call->count * 3 * 4);
call->unmarshall++;
case 2:
_debug("extract FID array");
- ret = afs_extract_data(call, call->buffer,
- call->count * 3 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -218,59 +323,46 @@ static int afs_deliver_cb_callback(struct afs_call *call)
cb->fid.vid = ntohl(*bp++);
cb->fid.vnode = ntohl(*bp++);
cb->fid.unique = ntohl(*bp++);
- cb->cb.type = AFSCM_CB_UNTYPED;
}
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* extract the callback array and its count in two steps */
case 3:
_debug("extract CB count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count2 = ntohl(call->tmp);
_debug("CB count: %u", call->count2);
if (call->count2 != call->count && call->count2 != 0)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_cb_count);
+ call->_iter = &call->iter;
+ iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
case 4:
- _debug("extract CB array");
- ret = afs_extract_data(call, call->buffer,
- call->count2 * 3 * 4, false);
+ _debug("extract discard %zu/%u",
+ iov_iter_count(&call->iter), call->count2 * 3 * 4);
+
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
- _debug("unmarshall CB array");
- cb = call->request;
- bp = call->buffer;
- for (loop = call->count2; loop > 0; loop--, cb++) {
- cb->cb.version = ntohl(*bp++);
- cb->cb.expiry = ntohl(*bp++);
- cb->cb.type = ntohl(*bp++);
- }
-
- call->offset = 0;
call->unmarshall++;
case 5:
break;
}
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
+ return afs_io_error(call, afs_io_error_cm_reply);
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
- call->cm_server = afs_find_server(call->net, &srx);
- if (!call->cm_server)
- trace_afs_cm_no_server(call, &srx);
-
- return afs_queue_call_work(call);
+ return afs_find_cm_server_by_peer(call);
}
/*
@@ -294,24 +386,18 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
*/
static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
- struct sockaddr_rxrpc srx;
int ret;
_enter("");
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-
- ret = afs_extract_data(call, NULL, 0, false);
+ afs_extract_discard(call, 0);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- call->cm_server = afs_find_server(call->net, &srx);
- if (!call->cm_server)
- trace_afs_cm_no_server(call, &srx);
-
- return afs_queue_call_work(call);
+ return afs_find_cm_server_by_peer(call);
}
/*
@@ -330,16 +416,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL);
if (!call->buffer)
return -ENOMEM;
+ afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
case 1:
_debug("extract UUID");
- ret = afs_extract_data(call, call->buffer,
- 11 * sizeof(__be32), false);
+ ret = afs_extract_data(call, false);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -362,7 +447,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
for (loop = 0; loop < 6; loop++)
r->node[loop] = ntohl(b[loop + 5]);
- call->offset = 0;
call->unmarshall++;
case 2:
@@ -370,17 +454,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
}
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
+ return afs_io_error(call, afs_io_error_cm_reply);
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- rcu_read_lock();
- call->cm_server = afs_find_server_by_uuid(call->net, call->request);
- rcu_read_unlock();
- if (!call->cm_server)
- trace_afs_cm_no_server_u(call, call->request);
-
- return afs_queue_call_work(call);
+ return afs_find_cm_server_by_uuid(call, call->request);
}
/*
@@ -405,14 +483,14 @@ static int afs_deliver_cb_probe(struct afs_call *call)
_enter("");
- ret = afs_extract_data(call, NULL, 0, false);
+ afs_extract_discard(call, 0);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
-
- return afs_queue_call_work(call);
+ return afs_io_error(call, afs_io_error_cm_reply);
+ return afs_find_cm_server_by_peer(call);
}
/*
@@ -453,16 +531,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL);
if (!call->buffer)
return -ENOMEM;
+ afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
case 1:
_debug("extract UUID");
- ret = afs_extract_data(call, call->buffer,
- 11 * sizeof(__be32), false);
+ ret = afs_extract_data(call, false);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -485,7 +562,6 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
for (loop = 0; loop < 6; loop++)
r->node[loop] = ntohl(b[loop + 5]);
- call->offset = 0;
call->unmarshall++;
case 2:
@@ -493,9 +569,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
}
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
-
- return afs_queue_call_work(call);
+ return afs_io_error(call, afs_io_error_cm_reply);
+ return afs_find_cm_server_by_uuid(call, call->request);
}
/*
@@ -570,12 +645,88 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
_enter("");
- ret = afs_extract_data(call, NULL, 0, false);
+ afs_extract_discard(call, 0);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
+ return afs_io_error(call, afs_io_error_cm_reply);
+ return afs_find_cm_server_by_peer(call);
+}
+
+/*
+ * deliver request data to a YFS CB.CallBack call
+ */
+static int afs_deliver_yfs_cb_callback(struct afs_call *call)
+{
+ struct afs_callback_break *cb;
+ struct yfs_xdr_YFSFid *bp;
+ size_t size;
+ int ret, loop;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* extract the FID array and its count in two steps */
+ case 1:
+ _debug("extract FID count");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("FID count: %u", call->count);
+ if (call->count > YFSCBMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_cb_fid_count);
+
+ size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid));
+ call->buffer = kmalloc(size, GFP_KERNEL);
+ if (!call->buffer)
+ return -ENOMEM;
+ afs_extract_to_buf(call, size);
+ call->unmarshall++;
+
+ case 2:
+ _debug("extract FID array");
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ _debug("unmarshall FID array");
+ call->request = kcalloc(call->count,
+ sizeof(struct afs_callback_break),
+ GFP_KERNEL);
+ if (!call->request)
+ return -ENOMEM;
+
+ cb = call->request;
+ bp = call->buffer;
+ for (loop = call->count; loop > 0; loop--, cb++) {
+ cb->fid.vid = xdr_to_u64(bp->volume);
+ cb->fid.vnode = xdr_to_u64(bp->vnode.lo);
+ cb->fid.vnode_hi = ntohl(bp->vnode.hi);
+ cb->fid.unique = ntohl(bp->vnode.unique);
+ bp++;
+ }
+
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ case 3:
+ break;
+ }
+
+ if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+ return afs_io_error(call, afs_io_error_cm_reply);
- return afs_queue_call_work(call);
+ /* We'll need the file server record as that tells us which set of
+ * vnodes to operate upon.
+ */
+ return afs_find_cm_server_by_peer(call);
}
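Every delivery function in this file follows the same incremental-unmarshalling pattern: call->unmarshall records the phase, the switch falls through from one case to the next, and a short read makes afs_extract_data() return -EAGAIN so the whole function can be re-entered later at the same case. Below is a minimal userspace model of that pattern, not kernel code; feed() stands in for afs_extract_data() and every name is illustrative.

/* Minimal userspace model of the call->unmarshall pattern, for
 * illustration only. feed() stands in for afs_extract_data(): it may
 * consume only part of the wanted bytes and return -EAGAIN, in which
 * case deliver() is simply re-entered later with the same state.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

struct call {
	int unmarshall;			/* which phase we are in */
	unsigned char buf[16];
	size_t have, want;		/* progress within the phase */
};

static int feed(struct call *c, const unsigned char **p, size_t *n)
{
	size_t chunk = *n < c->want - c->have ? *n : c->want - c->have;

	memcpy(c->buf + c->have, *p, chunk);
	c->have += chunk;
	*p += chunk;
	*n -= chunk;
	return c->have < c->want ? -EAGAIN : 0;
}

static int deliver(struct call *c, const unsigned char *p, size_t n)
{
	int ret;

	switch (c->unmarshall) {
	case 0:
		c->have = 0;
		c->want = 4;		/* say, a length word */
		c->unmarshall++;
		/* Fall through */
	case 1:
		ret = feed(c, &p, &n);
		if (ret < 0)
			return ret;	/* -EAGAIN: resume here later */
		c->have = 0;
		c->want = 8;		/* say, the payload */
		c->unmarshall++;
		/* Fall through */
	case 2:
		ret = feed(c, &p, &n);
		if (ret < 0)
			return ret;
		c->unmarshall++;
		/* Fall through */
	case 3:
		break;
	}
	return 0;
}

int main(void)
{
	struct call c = { 0 };
	const unsigned char data[12] = "ABCDEFGHIJK";

	/* Deliver in two fragments to exercise the -EAGAIN path. */
	printf("%d\n", deliver(&c, data, 5));		/* -EAGAIN */
	printf("%d\n", deliver(&c, data + 5, 7));	/* 0 */
	return 0;
}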
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 855bf2b79fed..8a2562e3a316 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -138,6 +138,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
ntohs(dbuf->blocks[tmp].hdr.magic));
trace_afs_dir_check_failed(dvnode, off, i_size);
kunmap(page);
+ trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
goto error;
}
@@ -190,9 +191,11 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
retry:
i_size = i_size_read(&dvnode->vfs_inode);
if (i_size < 2048)
- return ERR_PTR(-EIO);
- if (i_size > 2048 * 1024)
+ return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small));
+ if (i_size > 2048 * 1024) {
+ trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
return ERR_PTR(-EFBIG);
+ }
_enter("%llu", i_size);
@@ -315,7 +318,8 @@ content_has_grown:
/*
* deal with one block in an AFS directory
*/
-static int afs_dir_iterate_block(struct dir_context *ctx,
+static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+ struct dir_context *ctx,
union afs_xdr_dir_block *block,
unsigned blkoff)
{
@@ -365,7 +369,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
" (len %u/%zu)",
blkoff / sizeof(union afs_xdr_dir_block),
offset, next, tmp, nlen);
- return -EIO;
+ return afs_bad(dvnode, afs_file_error_dir_over_end);
}
if (!(block->hdr.bitmap[next / 8] &
(1 << (next % 8)))) {
@@ -373,7 +377,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
" %u unmarked extension (len %u/%zu)",
blkoff / sizeof(union afs_xdr_dir_block),
offset, next, tmp, nlen);
- return -EIO;
+ return afs_bad(dvnode, afs_file_error_dir_unmarked_ext);
}
_debug("ENT[%zu.%u]: ext %u/%zu",
@@ -442,7 +446,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
*/
page = req->pages[blkoff / PAGE_SIZE];
if (!page) {
- ret = -EIO;
+ ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
mark_page_accessed(page);
@@ -455,7 +459,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
do {
dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
sizeof(union afs_xdr_dir_block)];
- ret = afs_dir_iterate_block(ctx, dblock, blkoff);
+ ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff);
if (ret != 1) {
kunmap(page);
goto out;
@@ -548,7 +552,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
}
*fid = cookie.fid;
- _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+ _leave(" = 0 { vn=%llu u=%u }", fid->vnode, fid->unique);
return 0;
}
@@ -826,7 +830,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
struct key *key;
int ret;
- _enter("{%x:%u},%p{%pd},",
+ _enter("{%llx:%llu},%p{%pd},",
dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry);
ASSERTCMP(d_inode(dentry), ==, NULL);
@@ -896,7 +900,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (d_really_is_positive(dentry)) {
vnode = AFS_FS_I(d_inode(dentry));
- _enter("{v={%x:%u} n=%pd fl=%lx},",
+ _enter("{v={%llx:%llu} n=%pd fl=%lx},",
vnode->fid.vid, vnode->fid.vnode, dentry,
vnode->flags);
} else {
@@ -965,7 +969,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
/* if the vnode ID has changed, then the dirent points to a
* different file */
if (fid.vnode != vnode->fid.vnode) {
- _debug("%pd: dirent changed [%u != %u]",
+ _debug("%pd: dirent changed [%llu != %llu]",
dentry, fid.vnode,
vnode->fid.vnode);
goto not_found;
@@ -1071,8 +1075,6 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
if (fc->ac.error < 0)
return;
- d_drop(new_dentry);
-
inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key,
newfid, newstatus, newcb, fc->cbi);
if (IS_ERR(inode)) {
@@ -1085,7 +1087,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
vnode = AFS_FS_I(inode);
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- d_add(new_dentry, inode);
+ afs_vnode_commit_status(fc, vnode, 0);
+ d_instantiate(new_dentry, inode);
}
/*
@@ -1104,7 +1107,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
mode |= S_IFDIR;
- _enter("{%x:%u},{%pd},%ho",
+ _enter("{%llx:%llu},{%pd},%ho",
dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
key = afs_request_key(dvnode->volume->cell);
@@ -1169,12 +1172,12 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
static int afs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct afs_fs_cursor fc;
- struct afs_vnode *dvnode = AFS_FS_I(dir);
+ struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
u64 data_version = dvnode->status.data_version;
int ret;
- _enter("{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%pd}",
dvnode->fid.vid, dvnode->fid.vnode, dentry);
key = afs_request_key(dvnode->volume->cell);
@@ -1183,11 +1186,19 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
goto error;
}
+ /* Try to make sure we have a callback promise on the victim. */
+ if (d_really_is_positive(dentry)) {
+ vnode = AFS_FS_I(d_inode(dentry));
+ ret = afs_validate(vnode, key);
+ if (ret < 0)
+ goto error_key;
+ }
+
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
- afs_fs_remove(&fc, dentry->d_name.name, true,
+ afs_fs_remove(&fc, vnode, dentry->d_name.name, true,
data_version);
}
@@ -1201,6 +1212,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
}
}
+error_key:
key_put(key);
error:
return ret;
@@ -1231,7 +1243,9 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- if (dir_valid) {
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ /* Already done */
+ } else if (dir_valid) {
drop_nlink(&vnode->vfs_inode);
if (vnode->vfs_inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
@@ -1260,13 +1274,13 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
static int afs_unlink(struct inode *dir, struct dentry *dentry)
{
struct afs_fs_cursor fc;
- struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+ struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
unsigned long d_version = (unsigned long)dentry->d_fsdata;
u64 data_version = dvnode->status.data_version;
int ret;
- _enter("{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%pd}",
dvnode->fid.vid, dvnode->fid.vnode, dentry);
if (dentry->d_name.len >= AFSNAMEMAX)
@@ -1290,7 +1304,18 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
- afs_fs_remove(&fc, dentry->d_name.name, false,
+
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) &&
+ !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) {
+ yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name,
+ data_version);
+ if (fc.ac.error != -ECONNABORTED ||
+ fc.ac.abort_code != RXGEN_OPCODE)
+ continue;
+ set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags);
+ }
+
+ afs_fs_remove(&fc, vnode, dentry->d_name.name, false,
data_version);
}
@@ -1330,7 +1355,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
mode |= S_IFREG;
- _enter("{%x:%u},{%pd},%ho,",
+ _enter("{%llx:%llu},{%pd},%ho,",
dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
ret = -ENAMETOOLONG;
@@ -1393,7 +1418,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
dvnode = AFS_FS_I(dir);
data_version = dvnode->status.data_version;
- _enter("{%x:%u},{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%llx:%llu},{%pd}",
vnode->fid.vid, vnode->fid.vnode,
dvnode->fid.vid, dvnode->fid.vnode,
dentry);
@@ -1464,7 +1489,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
u64 data_version = dvnode->status.data_version;
int ret;
- _enter("{%x:%u},{%pd},%s",
+ _enter("{%llx:%llu},{%pd},%s",
dvnode->fid.vid, dvnode->fid.vnode, dentry,
content);
@@ -1540,7 +1565,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
orig_data_version = orig_dvnode->status.data_version;
new_data_version = new_dvnode->status.data_version;
- _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
vnode->fid.vid, vnode->fid.vnode,
new_dvnode->fid.vid, new_dvnode->fid.vnode,
@@ -1607,7 +1632,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
{
struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
- _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
set_page_private(page, 0);
ClearPagePrivate(page);
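The afs_unlink() hunk above introduces a probe-and-fall-back idiom: try the newer YFS.RemoveFile2 RPC first, and if the server aborts with RXGEN_OPCODE (the rxgen abort for an unknown operation), set AFS_SERVER_FL_NO_RM2 so that RPC is never retried against that server, then fall back to the classic remove. A compilable sketch of the control flow, with a fake server that always rejects the newer call; the structs and stub RPCs are made up for the demo, not the kernel's.

/* Sketch of the try-new-op-then-fall-back logic, for illustration.
 * RXGEN_OPCODE is rxgen's "unknown RPC operation" abort; this fake
 * server always rejects the newer call so the fallback path runs.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define RXGEN_OPCODE (-455)

struct server {
	bool is_yfs;	/* advertised the YFS service when probed */
	bool no_rm2;	/* learned that RemoveFile2 is unsupported */
};

static int yfs_remove_file2(struct server *s, const char *name,
			    int *abort_code)
{
	(void)s; (void)name;
	*abort_code = RXGEN_OPCODE;	/* server doesn't know this op */
	return -ECONNABORTED;
}

static int afs_remove_file(struct server *s, const char *name,
			   int *abort_code)
{
	(void)s; (void)abort_code;
	printf("removed %s via the classic RPC\n", name);
	return 0;
}

static int do_unlink(struct server *s, const char *name)
{
	int abort_code = 0, err;

	if (s->is_yfs && !s->no_rm2) {
		err = yfs_remove_file2(s, name, &abort_code);
		if (err != -ECONNABORTED || abort_code != RXGEN_OPCODE)
			return err;	/* op exists; take its answer */
		s->no_rm2 = true;	/* cache the capability miss */
	}
	return afs_remove_file(s, name, &abort_code);
}

int main(void)
{
	struct server s = { .is_yfs = true, .no_rm2 = false };

	return do_unlink(&s, "victim");
}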
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index f29c6dade7f6..a9ba81ddf154 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -46,7 +46,7 @@ static int afs_probe_cell_name(struct dentry *dentry)
return 0;
}
- ret = dns_query("afsdb", name, len, "", NULL, NULL);
+ ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL);
if (ret == -ENODATA)
ret = -EDESTADDRREQ;
return ret;
@@ -62,7 +62,7 @@ struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
struct inode *inode;
int ret = -ENOENT;
- _enter("%p{%pd}, {%x:%u}",
+ _enter("%p{%pd}, {%llx:%llu}",
dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7d4f26198573..323ae9912203 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -17,6 +17,7 @@
#include <linux/writeback.h>
#include <linux/gfp.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/mm.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -121,7 +122,7 @@ int afs_open(struct inode *inode, struct file *file)
struct key *key;
int ret;
- _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key)) {
@@ -170,7 +171,7 @@ int afs_release(struct inode *inode, struct file *file)
struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = file->private_data;
- _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
if ((file->f_mode & FMODE_WRITE))
return vfs_fsync(file, 0);
@@ -228,7 +229,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x,,,",
+ _enter("%s{%llx:%llu.%u},%x,,,",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -441,7 +442,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
/* Count the number of contiguous pages at the front of the list. Note
* that the list goes prev-wards rather than next-wards.
*/
- first = list_entry(pages->prev, struct page, lru);
+ first = lru_to_page(pages);
index = first->index + 1;
n = 1;
for (p = first->lru.prev; p != pages; p = p->prev) {
@@ -473,7 +474,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
* page at the end of the file.
*/
do {
- page = list_entry(pages->prev, struct page, lru);
+ page = lru_to_page(pages);
list_del(&page->lru);
index = page->index;
if (add_to_page_cache_lru(page, mapping, index,
@@ -634,7 +635,7 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
unsigned long priv;
- _enter("{{%x:%u}[%lu],%lx},%x",
+ _enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
gfp_flags);
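The two lru_to_page() conversions above replace an open-coded list_entry(pages->prev, struct page, lru); lru_to_page() is just a named form of that container_of() expression, taking the page at the tail of the list (readpages lists run prev-wards, as the comment notes). A self-contained illustration of the idiom, with toy struct definitions rather than the kernel's:

/* Toy illustration of the container_of() idiom behind lru_to_page();
 * the struct layouts here are stand-ins, not the kernel's.
 */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };
struct page { unsigned long index; struct list_head lru; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
/* What lru_to_page() names: the page whose ->lru is the list's tail. */
#define lru_to_page(head) list_entry((head)->prev, struct page, lru)

int main(void)
{
	struct page p = { .index = 42 };
	struct list_head pages = { .prev = &p.lru, .next = &p.lru };

	printf("%lu\n", lru_to_page(&pages)->index);	/* prints 42 */
	return 0;
}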
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index dc62d15a964b..e432bd27a2e7 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -29,7 +29,7 @@ static const struct file_lock_operations afs_lock_ops = {
*/
void afs_lock_may_be_available(struct afs_vnode *vnode)
{
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
}
@@ -76,7 +76,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x,%u",
+ _enter("%s{%llx:%llu.%u},%x,%u",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -107,7 +107,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x",
+ _enter("%s{%llx:%llu.%u},%x",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -138,7 +138,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x",
+ _enter("%s{%llx:%llu.%u},%x",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -175,7 +175,7 @@ void afs_lock_work(struct work_struct *work)
struct key *key;
int ret;
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
spin_lock(&vnode->lock);
@@ -192,7 +192,7 @@ again:
ret = afs_release_lock(vnode, vnode->lock_key);
if (ret < 0)
printk(KERN_WARNING "AFS:"
- " Failed to release lock on {%x:%x} error %d\n",
+ " Failed to release lock on {%llx:%llx} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
spin_lock(&vnode->lock);
@@ -208,7 +208,7 @@ again:
/* The new front of the queue now owns the state variables. */
next = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
- vnode->lock_key = afs_file_key(next->fl_file);
+ vnode->lock_key = key_get(afs_file_key(next->fl_file));
vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
goto again;
@@ -229,7 +229,7 @@ again:
key_put(key);
if (ret < 0)
- pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n",
+ pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
spin_lock(&vnode->lock);
@@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
/* The new front of the queue now owns the state variables. */
next = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
- vnode->lock_key = afs_file_key(next->fl_file);
+ vnode->lock_key = key_get(afs_file_key(next->fl_file));
vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
afs_lock_may_be_available(vnode);
@@ -430,7 +430,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
struct key *key = afs_file_key(file);
int ret;
- _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+ _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
/* only whole-file locks are supported */
if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
@@ -582,7 +582,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
int ret;
- _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+ _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
/* Flush all pending writes before doing anything with locks. */
vfs_fsync(file, 0);
@@ -639,7 +639,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
- _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+ _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
vnode->fid.vid, vnode->fid.vnode, cmd,
fl->fl_type, fl->fl_flags,
(long long) fl->fl_start, (long long) fl->fl_end);
@@ -662,7 +662,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
- _enter("{%x:%u},%d,{t=%x,fl=%x}",
+ _enter("{%llx:%llu},%d,{t=%x,fl=%x}",
vnode->fid.vid, vnode->fid.vnode, cmd,
fl->fl_type, fl->fl_flags);
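The two key_get() additions above fix a reference imbalance: vnode->lock_key is dropped with key_put() when the lock is released, so handing it a borrowed pointer from afs_file_key() would put a reference the lock path never took. The rule, reduced to a bare usage counter; key_get() and key_put() here are simplified stand-ins, not the kernel's key API.

#include <assert.h>

struct key { int usage; };

static struct key *key_get(struct key *k)
{
	if (k)
		k->usage++;
	return k;
}

static void key_put(struct key *k)
{
	if (k) {
		assert(k->usage > 0);
		k->usage--;
	}
}

int main(void)
{
	struct key file_key = { .usage = 1 };	/* held by the open file */
	struct key *lock_key;

	/* Without key_get(), lock_key would alias the file's single
	 * reference and the release-time key_put() would drop a count
	 * the lock path never took. */
	lock_key = key_get(&file_key);		/* usage -> 2 */
	key_put(lock_key);			/* lock released -> 1 */
	assert(file_key.usage == 1);		/* file's ref intact */
	return 0;
}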
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
new file mode 100644
index 000000000000..3a9eaec06756
--- /dev/null
+++ b/fs/afs/fs_probe.c
@@ -0,0 +1,279 @@
+/* AFS fileserver probing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "afs_fs.h"
+#include "internal.h"
+#include "protocol_yfs.h"
+
+static bool afs_fs_probe_done(struct afs_server *server)
+{
+ if (!atomic_dec_and_test(&server->probe_outstanding))
+ return false;
+
+ wake_up_var(&server->probe_outstanding);
+ clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags);
+ wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING);
+ return true;
+}
+
+/*
+ * Process the result of probing a fileserver. This is called after successful
+ * or failed delivery of an FS.GetCapabilities operation.
+ */
+void afs_fileserver_probe_result(struct afs_call *call)
+{
+ struct afs_addr_list *alist = call->alist;
+ struct afs_server *server = call->reply[0];
+ unsigned int server_index = (long)call->reply[1];
+ unsigned int index = call->addr_ix;
+ unsigned int rtt = UINT_MAX;
+ bool have_result = false;
+ u64 _rtt;
+ int ret = call->error;
+
+ _enter("%pU,%u", &server->uuid, index);
+
+ spin_lock(&server->probe_lock);
+
+ switch (ret) {
+ case 0:
+ server->probe.error = 0;
+ goto responded;
+ case -ECONNABORTED:
+ if (!server->probe.responded) {
+ server->probe.abort_code = call->abort_code;
+ server->probe.error = ret;
+ }
+ goto responded;
+ case -ENOMEM:
+ case -ENONET:
+ server->probe.local_failure = true;
+ afs_io_error(call, afs_io_error_fs_probe_fail);
+ goto out;
+ case -ECONNRESET: /* Responded, but call expired. */
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -EHOSTDOWN:
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ETIME:
+ default:
+ clear_bit(index, &alist->responded);
+ set_bit(index, &alist->failed);
+ if (!server->probe.responded &&
+ (server->probe.error == 0 ||
+ server->probe.error == -ETIMEDOUT ||
+ server->probe.error == -ETIME))
+ server->probe.error = ret;
+ afs_io_error(call, afs_io_error_fs_probe_fail);
+ goto out;
+ }
+
+responded:
+ set_bit(index, &alist->responded);
+ clear_bit(index, &alist->failed);
+
+ if (call->service_id == YFS_FS_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) {
+ clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ }
+ }
+
+ /* Get the RTT and scale it to fit into a 32-bit value that represents
+ * over a minute of time so that we can access it with one instruction
+ * on a 32-bit system.
+ */
+ _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+ _rtt /= 64;
+ rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
+ if (rtt < server->probe.rtt) {
+ server->probe.rtt = rtt;
+ alist->preferred = index;
+ have_result = true;
+ }
+
+ smp_wmb(); /* Set rtt before responded. */
+ server->probe.responded = true;
+ set_bit(AFS_SERVER_FL_PROBED, &server->flags);
+out:
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+ server_index, index, &alist->addrs[index].transport,
+ (unsigned int)rtt, ret);
+
+ have_result |= afs_fs_probe_done(server);
+ if (have_result) {
+ server->probe.have_result = true;
+ wake_up_var(&server->probe.have_result);
+ wake_up_all(&server->probe_wq);
+ }
+}
+
+/*
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
+ */
+static int afs_do_probe_fileserver(struct afs_net *net,
+ struct afs_server *server,
+ struct key *key,
+ unsigned int server_index,
+ struct afs_error *_e)
+{
+ struct afs_addr_cursor ac = {
+ .index = 0,
+ };
+ bool in_progress = false;
+ int err;
+
+ _enter("%pU", &server->uuid);
+
+ read_lock(&server->fs_lock);
+ ac.alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->fs_lock));
+ read_unlock(&server->fs_lock);
+
+ atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+ memset(&server->probe, 0, sizeof(server->probe));
+ server->probe.rtt = UINT_MAX;
+
+ for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
+ err = afs_fs_get_capabilities(net, server, &ac, key, server_index,
+ true);
+ if (err == -EINPROGRESS)
+ in_progress = true;
+ else
+ afs_prioritise_error(_e, err, ac.abort_code);
+ }
+
+ if (!in_progress)
+ afs_fs_probe_done(server);
+ return in_progress;
+}
+
+/*
+ * Send off probes to all unprobed servers.
+ */
+int afs_probe_fileservers(struct afs_net *net, struct key *key,
+ struct afs_server_list *list)
+{
+ struct afs_server *server;
+ struct afs_error e;
+ bool in_progress = false;
+ int i;
+
+ e.error = 0;
+ e.responded = false;
+ for (i = 0; i < list->nr_servers; i++) {
+ server = list->servers[i].server;
+ if (test_bit(AFS_SERVER_FL_PROBED, &server->flags))
+ continue;
+
+ if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags) &&
+ afs_do_probe_fileserver(net, server, key, i, &e))
+ in_progress = true;
+ }
+
+ return in_progress ? 0 : e.error;
+}
+
+/*
+ * Wait for the first as-yet untried fileserver to respond.
+ */
+int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
+{
+ struct wait_queue_entry *waits;
+ struct afs_server *server;
+ unsigned int rtt = UINT_MAX;
+ bool have_responders = false;
+ int pref = -1, i;
+
+ _enter("%u,%lx", slist->nr_servers, untried);
+
+ /* Only wait for servers that have a probe outstanding. */
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ if (!test_bit(AFS_SERVER_FL_PROBING, &server->flags))
+ __clear_bit(i, &untried);
+ if (server->probe.responded)
+ have_responders = true;
+ }
+ }
+ if (have_responders || !untried)
+ return 0;
+
+ waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL);
+ if (!waits)
+ return -ENOMEM;
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ init_waitqueue_entry(&waits[i], current);
+ add_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ for (;;) {
+ bool still_probing = false;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ if (server->probe.responded)
+ goto stop;
+ if (test_bit(AFS_SERVER_FL_PROBING, &server->flags))
+ still_probing = true;
+ }
+ }
+
+ if (!still_probing || signal_pending(current))
+ goto stop;
+ schedule();
+ }
+
+stop:
+ set_current_state(TASK_RUNNING);
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ if (server->probe.responded &&
+ server->probe.rtt < rtt) {
+ pref = i;
+ rtt = server->probe.rtt;
+ }
+
+ remove_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ kfree(waits);
+
+ if (pref == -1 && signal_pending(current))
+ return -ERESTARTSYS;
+
+ if (pref >= 0)
+ slist->preferred = pref;
+ return 0;
+}
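The scaling in afs_fileserver_probe_result() divides the sampled RTT by 64 before clamping it to 32 bits; assuming a nanosecond-resolution RTT, as the /64 and the in-code comment imply, the UINT_MAX ceiling then corresponds to roughly 274 seconds, comfortably "over a minute". A quick, illustrative check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint32_t scale_rtt(uint64_t rtt_ns)
{
	uint64_t scaled = rtt_ns / 64;	/* ~64ns units */

	return scaled > UINT32_MAX ? UINT32_MAX : (uint32_t)scaled;
}

int main(void)
{
	/* A 1.5ms round trip scales to a small, easily compared value. */
	printf("%u\n", scale_rtt(1500000));		/* 23437 */

	/* The saturation point: 2^32 * 64ns is about 274 seconds. */
	printf("%llu s\n",
	       (unsigned long long)UINT32_MAX * 64 / 1000000000);
	return 0;
}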
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 50929cb91732..ca08c83168f5 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -17,15 +17,10 @@
#include "internal.h"
#include "afs_fs.h"
#include "xdr_fs.h"
+#include "protocol_yfs.h"
static const struct afs_fid afs_zero_fid;
-/*
- * We need somewhere to discard into in case the server helpfully returns more
- * than we asked for in FS.FetchData{,64}.
- */
-static u8 afs_discard_buffer[64];
-
static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
{
call->cbi = afs_get_cb_interest(cbi);
@@ -75,8 +70,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode,
struct timespec64 t;
umode_t mode;
- t.tv_sec = status->mtime_client;
- t.tv_nsec = 0;
+ t = status->mtime_client;
vnode->vfs_inode.i_ctime = t;
vnode->vfs_inode.i_mtime = t;
vnode->vfs_inode.i_atime = t;
@@ -96,7 +90,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode,
if (!(flags & AFS_VNODE_NOT_YET_SET)) {
if (expected_version &&
*expected_version != status->data_version) {
- _debug("vnode modified %llx on {%x:%u} [exp %llx]",
+ _debug("vnode modified %llx on {%llx:%llu} [exp %llx]",
(unsigned long long) status->data_version,
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long) *expected_version);
@@ -170,7 +164,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
if (type != status->type &&
vnode &&
!test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
- pr_warning("Vnode %x:%x:%x changed type %u to %u\n",
+ pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
@@ -200,8 +194,10 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
EXTRACT_M(mode);
EXTRACT_M(group);
- status->mtime_client = ntohl(xdr->mtime_client);
- status->mtime_server = ntohl(xdr->mtime_server);
+ status->mtime_client.tv_sec = ntohl(xdr->mtime_client);
+ status->mtime_client.tv_nsec = 0;
+ status->mtime_server.tv_sec = ntohl(xdr->mtime_server);
+ status->mtime_server.tv_nsec = 0;
status->lock_count = ntohl(xdr->lock_count);
size = (u64)ntohl(xdr->size_lo);
@@ -233,7 +229,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
}
/*
@@ -273,7 +269,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
write_seqlock(&vnode->cb_lock);
- if (call->cb_break == afs_cb_break_sum(vnode, cbi)) {
+ if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) {
vnode->cb_version = ntohl(*bp++);
cb_expiry = ntohl(*bp++);
vnode->cb_type = ntohl(*bp++);
@@ -293,13 +289,19 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
*_bp = bp;
}
-static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+static ktime_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
+{
+ return ktime_add_ns(call->reply_time, expiry * NSEC_PER_SEC);
+}
+
+static void xdr_decode_AFSCallBack_raw(struct afs_call *call,
+ const __be32 **_bp,
struct afs_callback *cb)
{
const __be32 *bp = *_bp;
cb->version = ntohl(*bp++);
- cb->expiry = ntohl(*bp++);
+ cb->expires_at = xdr_decode_expiry(call, ntohl(*bp++));
cb->type = ntohl(*bp++);
*_bp = bp;
}
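xdr_decode_expiry() above turns the wire format's relative expiry (seconds from now) into an absolute ktime by adding it to call->reply_time rather than sampling the clock per callback, so every callback decoded from one reply shares the same base instant. A userspace sketch of that conversion using timespec arithmetic; the names and types are illustrative.

#include <stdio.h>
#include <time.h>

static long long expires_at_ns(struct timespec reply_time, unsigned int expiry)
{
	return (long long)reply_time.tv_sec * 1000000000
		+ reply_time.tv_nsec
		+ (long long)expiry * 1000000000;
}

int main(void)
{
	struct timespec reply = { .tv_sec = 1000, .tv_nsec = 500 };

	/* Two callbacks from the same reply expire relative to the same
	 * instant, however long the decode loop itself takes. */
	printf("%lld\n", expires_at_ns(reply, 300));	/* 1300000000500 */
	printf("%lld\n", expires_at_ns(reply, 600));	/* 1600000000500 */
	return 0;
}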
@@ -311,14 +313,18 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp,
struct afs_volsync *volsync)
{
const __be32 *bp = *_bp;
+ u32 creation;
- volsync->creation = ntohl(*bp++);
+ creation = ntohl(*bp++);
bp++; /* spare2 */
bp++; /* spare3 */
bp++; /* spare4 */
bp++; /* spare5 */
bp++; /* spare6 */
*_bp = bp;
+
+ if (volsync)
+ volsync->creation = creation;
}
/*
@@ -379,6 +385,8 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
vs->blocks_in_use = ntohl(*bp++);
vs->part_blocks_avail = ntohl(*bp++);
vs->part_max_blocks = ntohl(*bp++);
+ vs->vol_copy_date = 0;
+ vs->vol_backup_date = 0;
*_bp = bp;
}
@@ -395,16 +403,16 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
if (ret < 0)
return ret;
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
xdr_decode_AFSCallBack(call, vnode, &bp);
- if (call->reply[1])
- xdr_decode_AFSVolSync(&bp, call->reply[1]);
+ xdr_decode_AFSVolSync(&bp, call->reply[1]);
_leave(" = 0 [done]");
return 0;
@@ -431,7 +439,10 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_fetch_file_status(fc, volsync, new_inode);
+
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode,
@@ -445,6 +456,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
call->reply[0] = vnode;
call->reply[1] = volsync;
call->expected_version = new_inode ? 1 : vnode->status.data_version;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -468,139 +480,117 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_read *req = call->reply[2];
const __be32 *bp;
unsigned int size;
- void *buffer;
int ret;
- _enter("{%u,%zu/%u;%llu/%llu}",
- call->unmarshall, call->offset, call->count,
- req->remain, req->actual_len);
+ _enter("{%u,%zu/%llu}",
+ call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
switch (call->unmarshall) {
case 0:
req->actual_len = 0;
- call->offset = 0;
+ req->index = 0;
+ req->offset = req->pos & (PAGE_SIZE - 1);
call->unmarshall++;
- if (call->operation_ID != FSFETCHDATA64) {
- call->unmarshall++;
- goto no_msw;
+ if (call->operation_ID == FSFETCHDATA64) {
+ afs_extract_to_tmp64(call);
+ } else {
+ call->tmp_u = htonl(0);
+ afs_extract_to_tmp(call);
}
- /* extract the upper part of the returned data length of an
- * FSFETCHDATA64 op (which should always be 0 using this
- * client) */
- case 1:
- _debug("extract data length (MSW)");
- ret = afs_extract_data(call, &call->tmp, 4, true);
- if (ret < 0)
- return ret;
-
- req->actual_len = ntohl(call->tmp);
- req->actual_len <<= 32;
- call->offset = 0;
- call->unmarshall++;
-
- no_msw:
/* extract the returned data length */
- case 2:
+ case 1:
_debug("extract data length");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- req->actual_len |= ntohl(call->tmp);
+ req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
-
- req->remain = req->actual_len;
- call->offset = req->pos & (PAGE_SIZE - 1);
- req->index = 0;
- if (req->actual_len == 0)
+ req->remain = min(req->len, req->actual_len);
+ if (req->remain == 0)
goto no_more_data;
+
call->unmarshall++;
begin_page:
ASSERTCMP(req->index, <, req->nr_pages);
- if (req->remain > PAGE_SIZE - call->offset)
- size = PAGE_SIZE - call->offset;
+ if (req->remain > PAGE_SIZE - req->offset)
+ size = PAGE_SIZE - req->offset;
else
size = req->remain;
- call->count = call->offset + size;
- ASSERTCMP(call->count, <=, PAGE_SIZE);
- req->remain -= size;
+ call->bvec[0].bv_len = size;
+ call->bvec[0].bv_offset = req->offset;
+ call->bvec[0].bv_page = req->pages[req->index];
+ iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ ASSERTCMP(size, <=, PAGE_SIZE);
/* extract the returned data */
- case 3:
- _debug("extract data %llu/%llu %zu/%u",
- req->remain, req->actual_len, call->offset, call->count);
+ case 2:
+ _debug("extract data %zu/%llu",
+ iov_iter_count(&call->iter), req->remain);
- buffer = kmap(req->pages[req->index]);
- ret = afs_extract_data(call, buffer, call->count, true);
- kunmap(req->pages[req->index]);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- if (call->offset == PAGE_SIZE) {
+ req->remain -= call->bvec[0].bv_len;
+ req->offset += call->bvec[0].bv_len;
+ ASSERTCMP(req->offset, <=, PAGE_SIZE);
+ if (req->offset == PAGE_SIZE) {
+ req->offset = 0;
if (req->page_done)
req->page_done(call, req);
req->index++;
- if (req->remain > 0) {
- call->offset = 0;
- if (req->index >= req->nr_pages) {
- call->unmarshall = 4;
- goto begin_discard;
- }
+ if (req->remain > 0)
goto begin_page;
- }
}
- goto no_more_data;
+
+ ASSERTCMP(req->remain, ==, 0);
+ if (req->actual_len <= req->len)
+ goto no_more_data;
/* Discard any excess data the server gave us */
- begin_discard:
- case 4:
- size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain);
- call->count = size;
- _debug("extract discard %llu/%llu %zu/%u",
- req->remain, req->actual_len, call->offset, call->count);
-
- call->offset = 0;
- ret = afs_extract_data(call, afs_discard_buffer, call->count, true);
- req->remain -= call->offset;
+ iov_iter_discard(&call->iter, READ, req->actual_len - req->len);
+ call->unmarshall = 3;
+ case 3:
+ _debug("extract discard %zu/%llu",
+ iov_iter_count(&call->iter), req->actual_len - req->len);
+
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- if (req->remain > 0)
- goto begin_discard;
no_more_data:
- call->offset = 0;
- call->unmarshall = 5;
+ call->unmarshall = 4;
+ afs_extract_to_buf(call, (21 + 3 + 6) * 4);
/* extract the metadata */
- case 5:
- ret = afs_extract_data(call, call->buffer,
- (21 + 3 + 6) * 4, false);
+ case 4:
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &vnode->status.data_version, req) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &vnode->status.data_version, req);
+ if (ret < 0)
+ return ret;
xdr_decode_AFSCallBack(call, vnode, &bp);
- if (call->reply[1])
- xdr_decode_AFSVolSync(&bp, call->reply[1]);
+ xdr_decode_AFSVolSync(&bp, call->reply[1]);
- call->offset = 0;
call->unmarshall++;
- case 6:
+ case 5:
break;
}
for (; req->index < req->nr_pages; req->index++) {
- if (call->count < PAGE_SIZE)
+ if (req->offset < PAGE_SIZE)
zero_user_segment(req->pages[req->index],
- call->count, PAGE_SIZE);
+ req->offset, PAGE_SIZE);
if (req->page_done)
req->page_done(call, req);
- call->count = 0;
+ req->offset = 0;
}
_leave(" = 0 [done]");
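The rewritten FetchData delivery no longer kmap()s each page and copies by hand; it points a one-element bio_vec at the not-yet-filled part of the current page, sizes it to the smaller of the page space and the bytes remaining, and lets extraction fill exactly that window before advancing req->offset and req->index. A compilable userspace model of that loop, with PAGE_SIZE shrunk to 8 for the demo; nothing here is the kernel's iov_iter machinery.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 8

struct req {
	char pages[3][PAGE_SIZE];
	unsigned int index, offset;	/* current page, offset within */
	unsigned long long remain;	/* bytes still expected */
};

static void extract(struct req *r, const char **data, size_t *len)
{
	while (r->remain) {
		size_t size = PAGE_SIZE - r->offset;	/* window size */

		if (size > r->remain)
			size = r->remain;
		if (size > *len)
			size = *len;
		if (!size)
			return;		/* wait for the next packet */
		memcpy(&r->pages[r->index][r->offset], *data, size);
		*data += size;
		*len -= size;
		r->offset += size;
		r->remain -= size;
		if (r->offset == PAGE_SIZE) {	/* page done, move on */
			r->offset = 0;
			r->index++;
		}
	}
}

int main(void)
{
	/* Start 3 bytes into the first page, as req->pos & (PAGE_SIZE - 1)
	 * does above, and expect 13 bytes. */
	struct req r = { .index = 0, .offset = 3, .remain = 13 };
	const char *d = "abcdefghijklm";
	size_t n = 13;

	extract(&r, &d, &n);
	printf("%.5s | %.8s\n", r.pages[0] + 3, r.pages[1]);
	return 0;
}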
@@ -653,6 +643,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
call->reply[1] = NULL; /* volsync */
call->reply[2] = req;
call->expected_version = vnode->status.data_version;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -682,6 +673,9 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_fetch_data(fc, req);
+
if (upper_32_bits(req->pos) ||
upper_32_bits(req->len) ||
upper_32_bits(req->pos + req->len))
@@ -698,6 +692,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
call->reply[1] = NULL; /* volsync */
call->reply[2] = req;
call->expected_version = vnode->status.data_version;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -733,11 +728,14 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply[1]);
- if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 ||
- afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
- xdr_decode_AFSCallBack_raw(&bp, call->reply[3]);
+ ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_AFSCallBack_raw(call, &bp, call->reply[3]);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -778,6 +776,15 @@ int afs_fs_create(struct afs_fs_cursor *fc,
size_t namesz, reqsz, padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) {
+ if (S_ISDIR(mode))
+ return yfs_fs_make_dir(fc, name, mode, current_data_version,
+ newfid, newstatus, newcb);
+ else
+ return yfs_fs_create_file(fc, name, mode, current_data_version,
+ newfid, newstatus, newcb);
+ }
+
_enter("");
namesz = strlen(name);
@@ -796,6 +803,7 @@ int afs_fs_create(struct afs_fs_cursor *fc,
call->reply[2] = newstatus;
call->reply[3] = newcb;
call->expected_version = current_data_version + 1;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -839,9 +847,10 @@ static int afs_deliver_fs_remove(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -868,15 +877,18 @@ static const struct afs_call_type afs_RXFSRemoveDir = {
/*
* remove a file or directory
*/
-int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
- u64 current_data_version)
+int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+ const char *name, bool isdir, u64 current_data_version)
{
- struct afs_vnode *vnode = fc->vnode;
+ struct afs_vnode *dvnode = fc->vnode;
struct afs_call *call;
- struct afs_net *net = afs_v2net(vnode);
+ struct afs_net *net = afs_v2net(dvnode);
size_t namesz, reqsz, padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_remove(fc, vnode, name, isdir, current_data_version);
+
_enter("");
namesz = strlen(name);
@@ -890,15 +902,16 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
return -ENOMEM;
call->key = fc->key;
- call->reply[0] = vnode;
+ call->reply[0] = dvnode;
+ call->reply[1] = vnode;
call->expected_version = current_data_version + 1;
/* marshall the parameters */
bp = call->request;
*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
- *bp++ = htonl(vnode->fid.vid);
- *bp++ = htonl(vnode->fid.vnode);
- *bp++ = htonl(vnode->fid.unique);
+ *bp++ = htonl(dvnode->fid.vid);
+ *bp++ = htonl(dvnode->fid.vnode);
+ *bp++ = htonl(dvnode->fid.unique);
*bp++ = htonl(namesz);
memcpy(bp, name, namesz);
bp = (void *) bp + namesz;
@@ -908,7 +921,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
}
afs_use_fs_server(call, fc->cbi);
- trace_afs_make_fs_call(call, &vnode->fid);
+ trace_afs_make_fs_call(call, &dvnode->fid);
return afs_make_call(&fc->ac, call, GFP_NOFS, false);
}
@@ -929,10 +942,13 @@ static int afs_deliver_fs_link(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 ||
- afs_decode_status(call, &bp, &dvnode->status, dvnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = afs_decode_status(call, &bp, &dvnode->status, dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -961,6 +977,9 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
size_t namesz, reqsz, padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_link(fc, vnode, name, current_data_version);
+
_enter("");
namesz = strlen(name);
@@ -1016,10 +1035,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply[1]);
- if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) ||
- afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1052,6 +1074,10 @@ int afs_fs_symlink(struct afs_fs_cursor *fc,
size_t namesz, reqsz, padsz, c_namesz, c_padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_symlink(fc, name, contents, current_data_version,
+ newfid, newstatus);
+
_enter("");
namesz = strlen(name);
@@ -1122,13 +1148,16 @@ static int afs_deliver_fs_rename(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
- if (new_dvnode != orig_dvnode &&
- afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
- &call->expected_version_2, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ if (new_dvnode != orig_dvnode) {
+ ret = afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
+ &call->expected_version_2, NULL);
+ if (ret < 0)
+ return ret;
+ }
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1161,6 +1190,12 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_rename(fc, orig_name,
+ new_dvnode, new_name,
+ current_orig_data_version,
+ current_new_data_version);
+
_enter("");
o_namesz = strlen(orig_name);
@@ -1231,9 +1266,10 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
afs_pages_written_back(vnode, call);
@@ -1273,7 +1309,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(net, &afs_RXFSStoreData64,
@@ -1330,7 +1366,10 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
loff_t size, pos, i_size;
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_store_data(fc, mapping, first, last, offset, to);
+
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
size = (loff_t)to - (loff_t)offset;
@@ -1407,9 +1446,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1451,7 +1491,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
ASSERT(attr->ia_valid & ATTR_SIZE);
@@ -1498,7 +1538,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
ASSERT(attr->ia_valid & ATTR_SIZE);
@@ -1544,10 +1584,13 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_setattr(fc, attr);
+
if (attr->ia_valid & ATTR_SIZE)
return afs_fs_setattr_size(fc, attr);
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus,
@@ -1581,164 +1624,114 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
{
const __be32 *bp;
char *p;
+ u32 size;
int ret;
_enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
case 0:
- call->offset = 0;
call->unmarshall++;
+ afs_extract_to_buf(call, 12 * 4);
/* extract the returned status record */
case 1:
_debug("extract status");
- ret = afs_extract_data(call, call->buffer,
- 12 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
bp = call->buffer;
xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]);
- call->offset = 0;
call->unmarshall++;
+ afs_extract_to_tmp(call);
/* extract the volume name length */
case 2:
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("volname length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_volname_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
call->unmarshall++;
/* extract the volume name */
case 3:
_debug("extract volname");
- if (call->count > 0) {
- ret = afs_extract_data(call, call->reply[2],
- call->count, true);
- if (ret < 0)
- return ret;
- }
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
p = call->reply[2];
p[call->count] = 0;
_debug("volname '%s'", p);
-
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
- /* extract the volume name padding */
- if ((call->count & 3) == 0) {
- call->unmarshall++;
- goto no_volname_padding;
- }
- call->count = 4 - (call->count & 3);
-
- case 4:
- ret = afs_extract_data(call, call->buffer,
- call->count, true);
- if (ret < 0)
- return ret;
-
- call->offset = 0;
- call->unmarshall++;
- no_volname_padding:
-
/* extract the offline message length */
- case 5:
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ case 4:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("offline msg length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_offline_msg_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
call->unmarshall++;
/* extract the offline message */
- case 6:
+ case 5:
_debug("extract offline");
- if (call->count > 0) {
- ret = afs_extract_data(call, call->reply[2],
- call->count, true);
- if (ret < 0)
- return ret;
- }
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
p = call->reply[2];
p[call->count] = 0;
_debug("offline '%s'", p);
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
- /* extract the offline message padding */
- if ((call->count & 3) == 0) {
- call->unmarshall++;
- goto no_offline_padding;
- }
- call->count = 4 - (call->count & 3);
-
- case 7:
- ret = afs_extract_data(call, call->buffer,
- call->count, true);
- if (ret < 0)
- return ret;
-
- call->offset = 0;
- call->unmarshall++;
- no_offline_padding:
-
/* extract the message of the day length */
- case 8:
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ case 6:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("motd length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_motd_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
call->unmarshall++;
/* extract the message of the day */
- case 9:
+ case 7:
_debug("extract motd");
- if (call->count > 0) {
- ret = afs_extract_data(call, call->reply[2],
- call->count, true);
- if (ret < 0)
- return ret;
- }
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
p = call->reply[2];
p[call->count] = 0;
_debug("motd '%s'", p);
- call->offset = 0;
call->unmarshall++;
- /* extract the message of the day padding */
- call->count = (4 - (call->count & 3)) & 3;
-
- case 10:
- ret = afs_extract_data(call, call->buffer,
- call->count, false);
- if (ret < 0)
- return ret;
-
- call->offset = 0;
- call->unmarshall++;
- case 11:
+ case 8:
break;
}
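The reworked string extraction above drops the separate padding states by reading the XDR-padded size in one go: (n + 3) & ~3 rounds the on-wire length up to the 4-byte boundary XDR requires, swallowing the pad bytes along with the string. A quick check of that identity:

#include <assert.h>
#include <stdio.h>

static unsigned int xdr_padded(unsigned int n)
{
	return (n + 3) & ~3u;
}

int main(void)
{
	unsigned int n;

	for (n = 0; n <= 8; n++)
		printf("%u -> %u\n", n, xdr_padded(n));
	/* 0 -> 0, 1..4 -> 4, 5..8 -> 8: the length plus 0-3 pad bytes */
	assert(xdr_padded(5) == 8 && xdr_padded(8) == 8);
	return 0;
}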
@@ -1778,6 +1771,9 @@ int afs_fs_get_volume_status(struct afs_fs_cursor *fc,
__be32 *bp;
void *tmpbuf;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_get_volume_status(fc, vs);
+
_enter("");
tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
@@ -1867,6 +1863,9 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_set_lock(fc, type);
+
_enter("");
call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4);
@@ -1899,6 +1898,9 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_extend_lock(fc);
+
_enter("");
call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4);
@@ -1930,6 +1932,9 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_release_lock(fc);
+
_enter("");
call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4);
@@ -2004,19 +2009,16 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
u32 count;
int ret;
- _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+ _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter));
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* Extract the capabilities word count */
case 1:
- ret = afs_extract_data(call, &call->tmp,
- 1 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -2024,24 +2026,17 @@ again:
call->count = count;
call->count2 = count;
- call->offset = 0;
+ iov_iter_discard(&call->iter, READ, count * sizeof(__be32));
call->unmarshall++;
/* Extract capabilities words */
case 2:
- count = min(call->count, 16U);
- ret = afs_extract_data(call, call->buffer,
- count * sizeof(__be32),
- call->count > 16);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
/* TODO: Examine capabilities */
- call->count -= count;
- if (call->count > 0)
- goto again;
- call->offset = 0;
call->unmarshall++;
break;
}
@@ -2050,6 +2045,14 @@ again:
return 0;
}
+static void afs_destroy_fs_get_capabilities(struct afs_call *call)
+{
+ struct afs_server *server = call->reply[0];
+
+ afs_put_server(call->net, server);
+ afs_flat_call_destructor(call);
+}
+
/*
* FS.GetCapabilities operation type
*/
@@ -2057,7 +2060,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.name = "FS.GetCapabilities",
.op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities,
- .destructor = afs_flat_call_destructor,
+ .done = afs_fileserver_probe_result,
+ .destructor = afs_destroy_fs_get_capabilities,
};
/*
@@ -2067,7 +2071,9 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
int afs_fs_get_capabilities(struct afs_net *net,
struct afs_server *server,
struct afs_addr_cursor *ac,
- struct key *key)
+ struct key *key,
+ unsigned int server_index,
+ bool async)
{
struct afs_call *call;
__be32 *bp;
@@ -2079,6 +2085,10 @@ int afs_fs_get_capabilities(struct afs_net *net,
return -ENOMEM;
call->key = key;
+ call->reply[0] = afs_get_server(server);
+ call->reply[1] = (void *)(long)server_index;
+ call->upgrade = true;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -2086,7 +2096,7 @@ int afs_fs_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_fs_call(call, NULL);
- return afs_make_call(ac, call, GFP_NOFS, false);
+ return afs_make_call(ac, call, GFP_NOFS, async);
}
/*
@@ -2097,7 +2107,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
struct afs_file_status *status = call->reply[1];
struct afs_callback *callback = call->reply[2];
struct afs_volsync *volsync = call->reply[3];
- struct afs_vnode *vnode = call->reply[0];
+ struct afs_fid *fid = call->reply[0];
const __be32 *bp;
int ret;
@@ -2105,21 +2115,16 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
if (ret < 0)
return ret;
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", fid->vid, fid->vnode);
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- afs_decode_status(call, &bp, status, vnode,
- &call->expected_version, NULL);
- callback[call->count].version = ntohl(bp[0]);
- callback[call->count].expiry = ntohl(bp[1]);
- callback[call->count].type = ntohl(bp[2]);
- if (vnode)
- xdr_decode_AFSCallBack(call, vnode, &bp);
- else
- bp += 3;
- if (volsync)
- xdr_decode_AFSVolSync(&bp, volsync);
+ ret = afs_decode_status(call, &bp, status, NULL,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_AFSCallBack_raw(call, &bp, callback);
+ xdr_decode_AFSVolSync(&bp, volsync);
_leave(" = 0 [done]");
return 0;
@@ -2148,7 +2153,10 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
struct afs_call *call;
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_fetch_status(fc, net, fid, status, callback, volsync);
+
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), fid->vid, fid->vnode);
call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
@@ -2158,11 +2166,12 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
}
call->key = fc->key;
- call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[0] = fid;
call->reply[1] = status;
call->reply[2] = callback;
call->reply[3] = volsync;
call->expected_version = 1; /* vnode->status.data_version */
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -2193,38 +2202,40 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* Extract the file status count and array in two steps */
case 1:
_debug("extract status count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
tmp = ntohl(call->tmp);
_debug("status count: %u/%u", tmp, call->count2);
if (tmp != call->count2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_count);
call->count = 0;
call->unmarshall++;
more_counts:
- call->offset = 0;
+ afs_extract_to_buf(call, 21 * sizeof(__be32));
case 2:
_debug("extract status array %u", call->count);
- ret = afs_extract_data(call, call->buffer, 21 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
bp = call->buffer;
statuses = call->reply[1];
- if (afs_decode_status(call, &bp, &statuses[call->count],
- call->count == 0 ? vnode : NULL,
- NULL, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &statuses[call->count],
+ call->count == 0 ? vnode : NULL,
+ NULL, NULL);
+ if (ret < 0)
+ return ret;
call->count++;
if (call->count < call->count2)
@@ -2232,27 +2243,28 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
- call->offset = 0;
+ afs_extract_to_tmp(call);
/* Extract the callback count and array in two steps */
case 3:
_debug("extract CB count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
tmp = ntohl(call->tmp);
_debug("CB count: %u", tmp);
if (tmp != call->count2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_cb_count);
call->count = 0;
call->unmarshall++;
more_cbs:
- call->offset = 0;
+ afs_extract_to_buf(call, 3 * sizeof(__be32));
case 4:
_debug("extract CB array");
- ret = afs_extract_data(call, call->buffer, 3 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -2260,7 +2272,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
bp = call->buffer;
callbacks = call->reply[2];
callbacks[call->count].version = ntohl(bp[0]);
- callbacks[call->count].expiry = ntohl(bp[1]);
+ callbacks[call->count].expires_at = xdr_decode_expiry(call, ntohl(bp[1]));
callbacks[call->count].type = ntohl(bp[2]);
statuses = call->reply[1];
if (call->count == 0 && vnode && statuses[0].abort_code == 0)
@@ -2269,19 +2281,17 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
if (call->count < call->count2)
goto more_cbs;
- call->offset = 0;
+ afs_extract_to_buf(call, 6 * sizeof(__be32));
call->unmarshall++;
case 5:
- ret = afs_extract_data(call, call->buffer, 6 * 4, false);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
bp = call->buffer;
- if (call->reply[3])
- xdr_decode_AFSVolSync(&bp, call->reply[3]);
+ xdr_decode_AFSVolSync(&bp, call->reply[3]);
- call->offset = 0;
call->unmarshall++;
case 6:
@@ -2317,7 +2327,11 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
__be32 *bp;
int i;
- _enter(",%x,{%x:%u},%u",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_inline_bulk_status(fc, net, fids, statuses, callbacks,
+ nr_fids, volsync);
+
+ _enter(",%x,{%llx:%llu},%u",
	       key_serial(fc->key), fids[0].vid, fids[0].vnode, nr_fids);
call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus,
@@ -2334,6 +2348,7 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
call->reply[2] = callbacks;
call->reply[3] = volsync;
call->count2 = nr_fids;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
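
Throughout these deliver routines, the switch on call->unmarshall is a
resumable parser: each case programs the next extraction window
(afs_extract_to_tmp(), afs_extract_to_buf()), afs_extract_data() returns
early while that window is still short of data, and the phase counter
records where to resume when more of the reply arrives. A rough userspace
analogue of the shape (hypothetical and much simplified):

    #include <string.h>

    struct parser {
        int phase;              /* like call->unmarshall */
        size_t have, want;      /* progress through the current window */
        unsigned char buf[64];
    };

    /* Fill the current window from the data supplied; -1 = call again. */
    static int extract(struct parser *p, const void **data, size_t *len)
    {
        size_t n = p->want - p->have;

        if (n > *len)
            n = *len;
        memcpy(p->buf + p->have, *data, n);
        p->have += n;
        *data = (const char *)*data + n;
        *len -= n;
        return p->have < p->want ? -1 : 0;
    }

    static int deliver(struct parser *p, const void *data, size_t len)
    {
        switch (p->phase) {
        case 0:
            p->have = 0; p->want = 4;       /* first, a 4-byte count */
            p->phase++;
            /* Fall through */
        case 1:
            if (extract(p, &data, &len) < 0)
                return -1;                  /* resume here next packet */
            p->have = 0; p->want = 12;      /* then a 12-byte record */
            p->phase++;
            /* Fall through */
        case 2:
            if (extract(p, &data, &len) < 0)
                return -1;
            p->phase++;
        }
        return 0;
    }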
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 479b7fdda124..1a4ce07fb406 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -82,7 +82,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key)
default:
printk("kAFS: AFS vnode with undefined type\n");
read_sequnlock_excl(&vnode->cb_lock);
- return afs_protocol_error(NULL, -EBADMSG);
+ return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type);
}
inode->i_blocks = 0;
@@ -100,7 +100,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
struct afs_fs_cursor fc;
int ret;
- _enter("%s,{%x:%u.%u,S=%lx}",
+ _enter("%s,{%llx:%llu.%u,S=%lx}",
vnode->volume->name,
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
vnode->flags);
@@ -127,9 +127,9 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
int afs_iget5_test(struct inode *inode, void *opaque)
{
struct afs_iget_data *data = opaque;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
- return inode->i_ino == data->fid.vnode &&
- inode->i_generation == data->fid.unique;
+ return memcmp(&vnode->fid, &data->fid, sizeof(data->fid)) == 0;
}
/*
@@ -150,11 +150,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
struct afs_iget_data *data = opaque;
struct afs_vnode *vnode = AFS_FS_I(inode);
- inode->i_ino = data->fid.vnode;
- inode->i_generation = data->fid.unique;
vnode->fid = data->fid;
vnode->volume = data->volume;
+ /* YFS supports 96-bit vnode IDs, but Linux only supports
+ * 64-bit inode numbers.
+ */
+ inode->i_ino = data->fid.vnode;
+ inode->i_generation = data->fid.unique;
return 0;
}
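
afs_iget5_test() and afs_iget5_set() are the comparison and initialisation
callbacks for the VFS inode cache; comparing the whole fid with memcmp() is
what stops wide YFS vnode IDs aliasing now that i_ino is only a truncation
of fid.vnode. Roughly how they plug into the lookup (a sketch with error
handling trimmed; the hash argument choice here is illustrative):

    struct afs_iget_data data = { .volume = as->volume };
    struct inode *inode;

    data.fid = *fid;

    /* iget5_locked() only hashes on the value passed; identity is
     * decided by afs_iget5_test(), so the truncated vnode ID is an
     * acceptable hash input.
     */
    inode = iget5_locked(sb, fid->vnode, afs_iget5_test,
                         afs_iget5_set, &data);
    if (!inode)
        return ERR_PTR(-ENOMEM);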
@@ -193,7 +196,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
return ERR_PTR(-ENOMEM);
}
- _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
+ _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }",
inode, inode->i_ino, data.fid.vid, data.fid.vnode,
data.fid.unique);
@@ -252,8 +255,8 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
key.vnode_id = vnode->fid.vnode;
key.unique = vnode->fid.unique;
- key.vnode_id_ext[0] = 0;
- key.vnode_id_ext[1] = 0;
+ key.vnode_id_ext[0] = vnode->fid.vnode >> 32;
+ key.vnode_id_ext[1] = vnode->fid.vnode_hi;
aux.data_version = vnode->status.data_version;
vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
@@ -277,7 +280,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
struct inode *inode;
int ret;
- _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
+ _enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique);
as = sb->s_fs_info;
data.volume = as->volume;
@@ -289,7 +292,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
return ERR_PTR(-ENOMEM);
}
- _debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+ _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }",
inode, fid->vid, fid->vnode, fid->unique);
vnode = AFS_FS_I(inode);
@@ -314,11 +317,11 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
* didn't give us a callback) */
vnode->cb_version = 0;
vnode->cb_type = 0;
- vnode->cb_expires_at = 0;
+		vnode->cb_expires_at	= ktime_get_real_seconds();
} else {
vnode->cb_version = cb->version;
vnode->cb_type = cb->type;
- vnode->cb_expires_at = cb->expiry;
+ vnode->cb_expires_at = cb->expires_at;
vnode->cb_interest = afs_get_cb_interest(cbi);
set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
}
@@ -352,7 +355,7 @@ bad_inode:
*/
void afs_zap_data(struct afs_vnode *vnode)
{
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
#ifdef CONFIG_AFS_FSCACHE
fscache_invalidate(vnode->cache);
@@ -379,10 +382,10 @@ void afs_zap_data(struct afs_vnode *vnode)
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
time64_t now = ktime_get_real_seconds();
- bool valid = false;
+ bool valid;
int ret;
- _enter("{v={%x:%u} fl=%lx},%x",
+ _enter("{v={%llx:%llu} fl=%lx},%x",
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
@@ -399,15 +402,20 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->cb_v_break = vnode->volume->cb_v_break;
valid = false;
} else if (vnode->status.type == AFS_FTYPE_DIR &&
- test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) &&
- vnode->cb_expires_at - 10 > now) {
- valid = true;
- } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
- vnode->cb_expires_at - 10 > now) {
+ (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) ||
+ vnode->cb_expires_at - 10 <= now)) {
+ valid = false;
+ } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) ||
+ vnode->cb_expires_at - 10 <= now) {
+ valid = false;
+ } else {
valid = true;
}
} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
valid = true;
+ } else {
+ vnode->cb_v_break = vnode->volume->cb_v_break;
+ valid = false;
}
read_sequnlock_excl(&vnode->cb_lock);
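
The rewritten ladder inverts the old positive tests: a vnode holding a
callback promise is explicitly invalidated when any staleness condition
applies, and only the final else declares it valid. As a standalone
predicate (a paraphrase of the logic above, not the kernel's function):

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical distillation; times in seconds, with the same
     * 10s safety margin as above.
     */
    static bool cb_promise_valid(bool is_dir, bool dir_valid, bool zap_data,
                                 int64_t expires_at, int64_t now)
    {
        if (expires_at - 10 <= now)
            return false;           /* promise expired, or nearly so */
        if (is_dir)
            return dir_valid;       /* dir contents must still be valid */
        return !zap_data;           /* data must not be marked for zapping */
    }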
@@ -501,7 +509,7 @@ void afs_evict_inode(struct inode *inode)
vnode = AFS_FS_I(inode);
- _enter("{%x:%u.%d}",
+ _enter("{%llx:%llu.%d}",
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique);
@@ -537,6 +545,8 @@ void afs_evict_inode(struct inode *inode)
#endif
afs_put_permits(rcu_access_pointer(vnode->permit_cache));
+ key_put(vnode->lock_key);
+ vnode->lock_key = NULL;
_leave("");
}
@@ -550,7 +560,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
struct key *key;
int ret;
- _enter("{%x:%u},{n=%pd},%x",
+ _enter("{%llx:%llu},{n=%pd},%x",
vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 72de1f157d20..8871b9e8645f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -22,6 +22,7 @@
#include <linux/backing-dev.h>
#include <linux/uuid.h>
#include <linux/mm_types.h>
+#include <linux/dns_resolver.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -75,10 +76,13 @@ struct afs_addr_list {
u32 version; /* Version */
unsigned char max_addrs;
unsigned char nr_addrs;
- unsigned char index; /* Address currently in use */
+ unsigned char preferred; /* Preferred address */
unsigned char nr_ipv4; /* Number of IPv4 addresses */
+ enum dns_record_source source:8;
+ enum dns_lookup_status status:8;
unsigned long probed; /* Mask of servers that have been probed */
- unsigned long yfs; /* Mask of servers that are YFS */
+ unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long responded; /* Mask of addrs that responded */
struct sockaddr_rxrpc addrs[];
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
@@ -88,6 +92,7 @@ struct afs_addr_list {
*/
struct afs_call {
const struct afs_call_type *type; /* type of call */
+ struct afs_addr_list *alist; /* Address is alist[addr_ix] */
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
@@ -98,16 +103,22 @@ struct afs_call {
struct afs_cb_interest *cbi; /* Callback interest for server used */
void *request; /* request data (first part) */
struct address_space *mapping; /* Pages being written from */
+ struct iov_iter iter; /* Buffer iterator */
+ struct iov_iter *_iter; /* Iterator currently in use */
+ union { /* Convenience for ->iter */
+ struct kvec kvec[1];
+ struct bio_vec bvec[1];
+ };
void *buffer; /* reply receive buffer */
void *reply[4]; /* Where to put the reply */
pgoff_t first; /* first page in mapping to deal with */
pgoff_t last; /* last page in mapping to deal with */
- size_t offset; /* offset into received data store */
atomic_t usage;
enum afs_call_state state;
spinlock_t state_lock;
int error; /* error code */
u32 abort_code; /* Remote abort ID or 0 */
+ u32 epoch;
unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */
unsigned first_offset; /* offset into mapping[first] */
@@ -117,19 +128,28 @@ struct afs_call {
unsigned count2; /* count used in unmarshalling */
};
unsigned char unmarshall; /* unmarshalling phase */
+ unsigned char addr_ix; /* Address in ->alist */
bool incoming; /* T if incoming call */
bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool ret_reply0; /* T if should return reply[0] on success */
bool upgrade; /* T to request service upgrade */
+ bool want_reply_time; /* T if want reply_time */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
- __be32 tmp; /* place to extract temporary data */
+ union { /* place to extract temporary data */
+ struct {
+ __be32 tmp_u;
+ __be32 tmp;
+ } __attribute__((packed));
+ __be64 tmp64;
+ };
afs_dataversion_t expected_version; /* Updated version expected from store */
afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */
+ ktime_t reply_time; /* Time of first reply packet */
};
struct afs_call_type {
@@ -146,6 +166,9 @@ struct afs_call_type {
/* Work function */
void (*work)(struct work_struct *work);
+
+ /* Call done function (gets called immediately on success or failure) */
+ void (*done)(struct afs_call *call);
};
/*
@@ -185,6 +208,7 @@ struct afs_read {
refcount_t usage;
unsigned int index; /* Which page we're reading into */
unsigned int nr_pages;
+ unsigned int offset; /* offset into current page */
void (*page_done)(struct afs_call *, struct afs_read *);
struct page **pages;
struct page *array[];
@@ -343,13 +367,70 @@ struct afs_cell {
rwlock_t proc_lock;
/* VL server list. */
- rwlock_t vl_addrs_lock; /* Lock on vl_addrs */
- struct afs_addr_list __rcu *vl_addrs; /* List of VL servers */
+ rwlock_t vl_servers_lock; /* Lock on vl_servers */
+ struct afs_vlserver_list __rcu *vl_servers;
+
u8 name_len; /* Length of name */
char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */
};
/*
+ * Volume Location server record.
+ */
+struct afs_vlserver {
+ struct rcu_head rcu;
+ struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */
+ unsigned long flags;
+#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */
+#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */
+#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
+ rwlock_t lock; /* Lock on addresses */
+ atomic_t usage;
+
+ /* Probe state */
+ wait_queue_head_t probe_wq;
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+ unsigned int rtt; /* RTT as ktime/64 */
+ u32 abort_code;
+ short error;
+ bool have_result;
+ bool responded:1;
+ bool is_yfs:1;
+ bool not_yfs:1;
+ bool local_failure:1;
+ } probe;
+
+ u16 port;
+ u16 name_len; /* Length of name */
+ char name[]; /* Server name, case-flattened */
+};
+
+/*
+ * Weighted list of Volume Location servers.
+ */
+struct afs_vlserver_entry {
+ u16 priority; /* Preference (as SRV) */
+ u16 weight; /* Weight (as SRV) */
+ enum dns_record_source source:8;
+ enum dns_lookup_status status:8;
+ struct afs_vlserver *server;
+};
+
+struct afs_vlserver_list {
+ struct rcu_head rcu;
+ atomic_t usage;
+ u8 nr_servers;
+ u8 index; /* Server currently in use */
+ u8 preferred; /* Preferred server */
+ enum dns_record_source source:8;
+ enum dns_lookup_status status:8;
+ rwlock_t lock;
+ struct afs_vlserver_entry servers[];
+};
+
+/*
* Cached VLDB entry.
*
* This is pointed to by cell->vldb_entries, indexed by name.
@@ -403,8 +484,12 @@ struct afs_server {
#define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */
#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
+#define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */
+#define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */
+#define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */
atomic_t usage;
u32 addr_version; /* Address list version */
+ u32 cm_epoch; /* Server RxRPC epoch */
/* file service access */
rwlock_t fs_lock; /* access lock */
@@ -413,6 +498,26 @@ struct afs_server {
struct hlist_head cb_volumes; /* List of volume interests on this server */
unsigned cb_s_break; /* Break-everything counter. */
rwlock_t cb_break_lock; /* Volume finding lock */
+
+ /* Probe state */
+ wait_queue_head_t probe_wq;
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+ unsigned int rtt; /* RTT as ktime/64 */
+ u32 abort_code;
+ u32 cm_epoch;
+ short error;
+ bool have_result;
+ bool responded:1;
+ bool is_yfs:1;
+ bool not_yfs:1;
+ bool local_failure:1;
+ bool no_epoch:1;
+ bool cm_probed:1;
+ bool said_rebooted:1;
+ bool said_inconsistent:1;
+ } probe;
};
/*
@@ -447,8 +552,8 @@ struct afs_server_entry {
struct afs_server_list {
refcount_t usage;
- unsigned short nr_servers;
- unsigned short index; /* Server currently in use */
+ unsigned char nr_servers;
+ unsigned char preferred; /* Preferred server */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
rwlock_t lock;
@@ -550,6 +655,15 @@ struct afs_vnode {
afs_callback_type_t cb_type; /* type of callback */
};
+static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ return vnode->cache;
+#else
+ return NULL;
+#endif
+}
+
/*
* cached security record for one user's attempt to access a vnode
*/
@@ -582,17 +696,43 @@ struct afs_interface {
};
/*
+ * Error prioritisation and accumulation.
+ */
+struct afs_error {
+ short error; /* Accumulated error */
+ bool responded; /* T if server responded */
+};
+
+/*
* Cursor for iterating over a server's address list.
*/
struct afs_addr_cursor {
struct afs_addr_list *alist; /* Current address list (pins ref) */
- struct sockaddr_rxrpc *addr;
+ unsigned long tried; /* Tried addresses */
+ signed char index; /* Current address */
+ bool responded; /* T if the current address responded */
+ unsigned short nr_iterations; /* Number of address iterations */
+ short error;
u32 abort_code;
- unsigned short start; /* Starting point in alist->addrs[] */
- unsigned short index; /* Wrapping offset from start to current addr */
+};
+
+/*
+ * Cursor for iterating over a set of volume location servers.
+ */
+struct afs_vl_cursor {
+ struct afs_addr_cursor ac;
+ struct afs_cell *cell; /* The cell we're querying */
+ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */
+ struct afs_vlserver *server; /* Server on which this resides */
+ struct key *key; /* Key for the server */
+ unsigned long untried; /* Bitmask of untried servers */
+ short index; /* Current server */
short error;
- bool begun; /* T if we've begun iteration */
- bool responded; /* T if the current address responded */
+ unsigned short flags;
+#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
+#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
+#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
+ unsigned short nr_iterations; /* Number of server iterations */
};
/*
@@ -604,10 +744,11 @@ struct afs_fs_cursor {
struct afs_server_list *server_list; /* Current server list (pins ref) */
struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */
struct key *key; /* Key for the server */
+ unsigned long untried; /* Bitmask of untried servers */
unsigned int cb_break; /* cb_break + cb_s_break before the call */
unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */
- unsigned char start; /* Initial index in server list */
- unsigned char index; /* Number of servers tried beyond start */
+ short index; /* Current server */
+ short error;
unsigned short flags;
#define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */
#define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */
@@ -615,6 +756,7 @@ struct afs_fs_cursor {
#define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */
#define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */
#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */
+ unsigned short nr_iterations; /* Number of server iterations */
};
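
Both cursors now track progress with an untried bitmask instead of
start/index arithmetic, which is also why nr_servers shrank to a byte:
there is one unsigned long bit per server. The rotation code seeds the
mask with (1UL << nr_servers) - 1 and clears a bit per attempt, e.g.:

    #include <stdio.h>

    /* Standalone illustration of the bitmask idiom. */
    int main(void)
    {
        unsigned int nr_servers = 5;
        unsigned long untried = (1UL << nr_servers) - 1;    /* 0b11111 */

        while (untried) {
            /* Lowest-numbered untried server (GCC/Clang builtin). */
            int i = __builtin_ctzl(untried);

            printf("trying server %d\n", i);
            untried &= ~(1UL << i);         /* like __clear_bit() */
        }
        return 0;
    }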
/*
@@ -640,12 +782,12 @@ extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
unsigned short,
unsigned short);
extern void afs_put_addrlist(struct afs_addr_list *);
-extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char,
- unsigned short, unsigned short);
-extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *);
+extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+ const char *, size_t, char,
+ unsigned short, unsigned short);
+extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
extern bool afs_iterate_addresses(struct afs_addr_cursor *);
extern int afs_end_cursor(struct afs_addr_cursor *);
-extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *);
extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
@@ -668,6 +810,7 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
* callback.c
*/
extern void afs_init_callback_state(struct afs_server *);
+extern void __afs_break_callback(struct afs_vnode *);
extern void afs_break_callback(struct afs_vnode *);
extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*);
@@ -688,10 +831,13 @@ static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
}
-static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode,
- struct afs_cb_interest *cbi)
+static inline bool afs_cb_is_broken(unsigned int cb_break,
+ const struct afs_vnode *vnode,
+ const struct afs_cb_interest *cbi)
{
- return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break;
+ return !cbi || cb_break != (vnode->cb_break +
+ cbi->server->cb_s_break +
+ vnode->volume->cb_v_break);
}
/*
@@ -781,7 +927,7 @@ extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *);
extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64,
struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64);
+extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64);
extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
struct afs_fid *, struct afs_file_status *);
@@ -797,7 +943,7 @@ extern int afs_fs_release_lock(struct afs_fs_cursor *);
extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
struct afs_addr_cursor *, struct key *);
extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
+ struct afs_addr_cursor *, struct key *, unsigned int, bool);
extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
struct afs_fid *, struct afs_file_status *,
struct afs_callback *, unsigned int,
@@ -807,6 +953,13 @@ extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
struct afs_callback *, struct afs_volsync *);
/*
+ * fs_probe.c
+ */
+extern void afs_fileserver_probe_result(struct afs_call *);
+extern int afs_probe_fileservers(struct afs_net *, struct key *, struct afs_server_list *);
+extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
+
+/*
* inode.c
*/
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool);
@@ -870,6 +1023,7 @@ static inline void __afs_stat(atomic_t *s)
* misc.c
*/
extern int afs_abort_to_error(u32);
+extern void afs_prioritise_error(struct afs_error *, int, u32);
/*
* mntpt.c
@@ -922,7 +1076,6 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
-extern int afs_queue_call_work(struct afs_call *);
extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *,
@@ -930,12 +1083,39 @@ extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
extern void afs_flat_call_destructor(struct afs_call *);
extern void afs_send_empty_reply(struct afs_call *);
extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
-extern int afs_extract_data(struct afs_call *, void *, size_t, bool);
-extern int afs_protocol_error(struct afs_call *, int);
+extern int afs_extract_data(struct afs_call *, bool);
+extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause);
+
+static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
+{
+ call->kvec[0].iov_base = buf;
+ call->kvec[0].iov_len = size;
+ iov_iter_kvec(&call->iter, READ, call->kvec, 1, size);
+}
+
+static inline void afs_extract_to_tmp(struct afs_call *call)
+{
+ afs_extract_begin(call, &call->tmp, sizeof(call->tmp));
+}
+
+static inline void afs_extract_to_tmp64(struct afs_call *call)
+{
+ afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64));
+}
+
+static inline void afs_extract_discard(struct afs_call *call, size_t size)
+{
+ iov_iter_discard(&call->iter, READ, size);
+}
+
+static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
+{
+ afs_extract_begin(call, call->buffer, size);
+}
static inline int afs_transfer_reply(struct afs_call *call)
{
- return afs_extract_data(call, call->buffer, call->reply_max, false);
+ return afs_extract_data(call, false);
}
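
The per-call offset bookkeeping is gone: afs_extract_begin() aims a
one-element kvec at the target buffer and afs_extract_data() drains
received bytes into whatever window is current. Conceptually the iterator
is just a cursor over (base, len) segments, something like this userspace
reduction (hypothetical, not the kernel's iov_iter):

    #include <stddef.h>
    #include <string.h>

    struct kvec { void *iov_base; size_t iov_len; };

    struct iter {
        const struct kvec *vec;
        size_t nvec, seg, off;      /* current segment, offset into it */
    };

    /* Copy into the iterator's segments, advancing the cursor. */
    static size_t iter_copy_in(struct iter *it, const void *src, size_t len)
    {
        size_t done = 0;

        while (len && it->seg < it->nvec) {
            struct kvec v = it->vec[it->seg];
            size_t n = v.iov_len - it->off;

            if (n > len)
                n = len;
            memcpy((char *)v.iov_base + it->off,
                   (const char *)src + done, n);
            it->off += n;
            done += n;
            len -= n;
            if (it->off == v.iov_len) {
                it->seg++;
                it->off = 0;
            }
        }
        return done;
    }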
static inline bool afs_check_call_state(struct afs_call *call,
@@ -1012,7 +1192,6 @@ extern void afs_put_server(struct afs_net *, struct afs_server *);
extern void afs_manage_servers(struct work_struct *);
extern void afs_servers_timer(struct timer_list *);
extern void __net_exit afs_purge_servers(struct afs_net *);
-extern bool afs_probe_fileserver(struct afs_fs_cursor *);
extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *);
/*
@@ -1039,14 +1218,51 @@ extern void afs_fs_exit(void);
/*
* vlclient.c
*/
-extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *,
- struct afs_addr_cursor *,
- struct key *, const char *, int);
-extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *,
- struct key *, const uuid_t *);
-extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *);
-extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *,
- struct key *, const uuid_t *);
+extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *,
+ const char *, int);
+extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *);
+extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *,
+ struct afs_vlserver *, unsigned int, bool);
+extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *);
+
+/*
+ * vl_probe.c
+ */
+extern void afs_vlserver_probe_result(struct afs_call *);
+extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *);
+extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long);
+
+/*
+ * vl_rotate.c
+ */
+extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *,
+ struct afs_cell *, struct key *);
+extern bool afs_select_vlserver(struct afs_vl_cursor *);
+extern bool afs_select_current_vlserver(struct afs_vl_cursor *);
+extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
+
+/*
+ * vlserver_list.c
+ */
+static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
+{
+ atomic_inc(&vlserver->usage);
+ return vlserver;
+}
+
+static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
+{
+ if (vllist)
+ atomic_inc(&vllist->usage);
+ return vllist;
+}
+
+extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short);
+extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *);
+extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int);
+extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *);
+extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
+ const void *, size_t);
/*
* volume.c
@@ -1089,6 +1305,36 @@ extern int afs_launder_page(struct page *);
extern const struct xattr_handler *afs_xattr_handlers[];
extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
+/*
+ * yfsclient.c
+ */
+extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool);
+extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
+extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, u64,
+ struct afs_fid *, struct afs_file_status *, struct afs_callback *);
+extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, u64,
+ struct afs_fid *, struct afs_file_status *, struct afs_callback *);
+extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
+extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64);
+extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
+extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
+ struct afs_fid *, struct afs_file_status *);
+extern int yfs_fs_rename(struct afs_fs_cursor *, const char *,
+ struct afs_vnode *, const char *, u64, u64);
+extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
+ pgoff_t, pgoff_t, unsigned, unsigned);
+extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
+extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
+extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t);
+extern int yfs_fs_extend_lock(struct afs_fs_cursor *);
+extern int yfs_fs_release_lock(struct afs_fs_cursor *);
+extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, struct afs_volsync *);
+extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, unsigned int,
+ struct afs_volsync *);
/*
* Miscellaneous inline functions.
@@ -1120,6 +1366,17 @@ static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc,
}
}
+static inline int afs_io_error(struct afs_call *call, enum afs_io_error where)
+{
+ trace_afs_io_error(call->debug_id, -EIO, where);
+ return -EIO;
+}
+
+static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where)
+{
+ trace_afs_file_error(vnode, -EIO, where);
+ return -EIO;
+}
/*****************************************************************************/
/*
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 700a5fa7f4ec..bbb1fd51b019 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -118,3 +118,55 @@ int afs_abort_to_error(u32 abort_code)
default: return -EREMOTEIO;
}
}
+
+/*
+ * Select the error to report from a set of errors.
+ */
+void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+{
+ switch (error) {
+ case 0:
+ return;
+ default:
+ if (e->error == -ETIMEDOUT ||
+ e->error == -ETIME)
+ return;
+ case -ETIMEDOUT:
+ case -ETIME:
+ if (e->error == -ENOMEM ||
+ e->error == -ENONET)
+ return;
+ case -ENOMEM:
+ case -ENONET:
+ if (e->error == -ERFKILL)
+ return;
+ case -ERFKILL:
+ if (e->error == -EADDRNOTAVAIL)
+ return;
+ case -EADDRNOTAVAIL:
+ if (e->error == -ENETUNREACH)
+ return;
+ case -ENETUNREACH:
+ if (e->error == -EHOSTUNREACH)
+ return;
+ case -EHOSTUNREACH:
+ if (e->error == -EHOSTDOWN)
+ return;
+ case -EHOSTDOWN:
+ if (e->error == -ECONNREFUSED)
+ return;
+ case -ECONNREFUSED:
+ if (e->error == -ECONNRESET)
+ return;
+ case -ECONNRESET: /* Responded, but call expired. */
+ if (e->responded)
+ return;
+ e->error = error;
+ return;
+
+ case -ECONNABORTED:
+ e->responded = true;
+ e->error = afs_abort_to_error(abort_code);
+ return;
+ }
+}
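
The fallthrough ladder encodes a severity order, weakest first:
miscellaneous errors, then ETIMEDOUT/ETIME, ENOMEM/ENONET, ERFKILL,
EADDRNOTAVAIL, ENETUNREACH, EHOSTUNREACH, EHOSTDOWN, ECONNREFUSED and
ECONNRESET, with an abort from a server that actually responded trumping
the lot. The same rule as an explicit table, checkable in userspace
(Linux errno values; the abort path is omitted from this sketch):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct afs_error { int error; bool responded; };

    /* Severity rank, weakest first, mirroring the ladder above. */
    static int rank(int error)
    {
        static const int order[] = {
            -ETIMEDOUT, -ETIME, -ENOMEM, -ENONET, -ERFKILL,
            -EADDRNOTAVAIL, -ENETUNREACH, -EHOSTUNREACH,
            -EHOSTDOWN, -ECONNREFUSED, -ECONNRESET,
        };
        for (unsigned int i = 0; i < sizeof(order) / sizeof(order[0]); i++)
            if (error == order[i])
                return (int)i + 1;
        return 0;               /* anything else ranks weakest */
    }

    static void prioritise(struct afs_error *e, int error)
    {
        if (error == 0 || e->responded)
            return;
        if (rank(error) >= rank(e->error))
            e->error = error;
    }

    int main(void)
    {
        struct afs_error e = { .error = -EDESTADDRREQ };

        prioritise(&e, -ETIMEDOUT);     /* replaces EDESTADDRREQ */
        prioritise(&e, -ECONNREFUSED);  /* stronger still */
        prioritise(&e, -ENETUNREACH);   /* weaker: ignored */
        printf("%d\n", e.error);        /* prints -ECONNREFUSED */
        return 0;
    }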
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 99fd13500a97..2e51c6994148 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -130,9 +130,10 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
goto error_no_page;
}
- ret = -EIO;
- if (PageError(page))
+ if (PageError(page)) {
+ ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
goto error;
+ }
buf = kmap_atomic(page);
memcpy(devname, buf, size);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 9101f62707af..be2ee3bbd0a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -17,6 +17,11 @@
#include <linux/uaccess.h>
#include "internal.h"
+struct afs_vl_seq_net_private {
+ struct seq_net_private seq; /* Must be first */
+ struct afs_vlserver_list *vllist;
+};
+
static inline struct afs_net *afs_seq2net(struct seq_file *m)
{
return afs_net(seq_file_net(m));
@@ -32,16 +37,24 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m)
*/
static int afs_proc_cells_show(struct seq_file *m, void *v)
{
- struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
+ struct afs_vlserver_list *vllist;
+ struct afs_cell *cell;
if (v == SEQ_START_TOKEN) {
/* display header on line 1 */
- seq_puts(m, "USE NAME\n");
+ seq_puts(m, "USE TTL SV NAME\n");
return 0;
}
+ cell = list_entry(v, struct afs_cell, proc_link);
+ vllist = rcu_dereference(cell->vl_servers);
+
/* display one cell per line on subsequent lines */
- seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name);
+ seq_printf(m, "%3u %6lld %2u %s\n",
+ atomic_read(&cell->usage),
+ cell->dns_expiry - ktime_get_real_seconds(),
+ vllist ? vllist->nr_servers : 0,
+ cell->name);
return 0;
}
@@ -208,7 +221,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
return 0;
}
- seq_printf(m, "%3d %08x %s\n",
+ seq_printf(m, "%3d %08llx %s\n",
atomic_read(&vol->usage), vol->vid,
afs_vol_types[vol->type]);
@@ -247,61 +260,102 @@ static const struct seq_operations afs_proc_cell_volumes_ops = {
.show = afs_proc_cell_volumes_show,
};
+static const char *const dns_record_sources[NR__dns_record_source + 1] = {
+ [DNS_RECORD_UNAVAILABLE] = "unav",
+ [DNS_RECORD_FROM_CONFIG] = "cfg",
+ [DNS_RECORD_FROM_DNS_A] = "A",
+ [DNS_RECORD_FROM_DNS_AFSDB] = "AFSDB",
+ [DNS_RECORD_FROM_DNS_SRV] = "SRV",
+ [DNS_RECORD_FROM_NSS] = "nss",
+ [NR__dns_record_source] = "[weird]"
+};
+
+static const char *const dns_lookup_statuses[NR__dns_lookup_status + 1] = {
+ [DNS_LOOKUP_NOT_DONE] = "no-lookup",
+ [DNS_LOOKUP_GOOD] = "good",
+ [DNS_LOOKUP_GOOD_WITH_BAD] = "good/bad",
+ [DNS_LOOKUP_BAD] = "bad",
+ [DNS_LOOKUP_GOT_NOT_FOUND] = "not-found",
+ [DNS_LOOKUP_GOT_LOCAL_FAILURE] = "local-failure",
+ [DNS_LOOKUP_GOT_TEMP_FAILURE] = "temp-failure",
+ [DNS_LOOKUP_GOT_NS_FAILURE] = "ns-failure",
+ [NR__dns_lookup_status] = "[weird]"
+};
+
/*
* Display the list of Volume Location servers we're using for a cell.
*/
static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
{
- struct sockaddr_rxrpc *addr = v;
+ const struct afs_vl_seq_net_private *priv = m->private;
+ const struct afs_vlserver_list *vllist = priv->vllist;
+ const struct afs_vlserver_entry *entry;
+ const struct afs_vlserver *vlserver;
+ const struct afs_addr_list *alist;
+ int i;
- /* display header on line 1 */
- if (v == (void *)1) {
- seq_puts(m, "ADDRESS\n");
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, "# source %s, status %s\n",
+ dns_record_sources[vllist->source],
+ dns_lookup_statuses[vllist->status]);
return 0;
}
- /* display one cell per line on subsequent lines */
- seq_printf(m, "%pISp\n", &addr->transport);
+ entry = v;
+ vlserver = entry->server;
+ alist = rcu_dereference(vlserver->addresses);
+
+ seq_printf(m, "%s [p=%hu w=%hu s=%s,%s]:\n",
+ vlserver->name, entry->priority, entry->weight,
+ dns_record_sources[alist ? alist->source : entry->source],
+ dns_lookup_statuses[alist ? alist->status : entry->status]);
+ if (alist) {
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " %c %pISpc\n",
+ alist->preferred == i ? '>' : '-',
+ &alist->addrs[i].transport);
+ }
return 0;
}
static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
__acquires(rcu)
{
- struct afs_addr_list *alist;
+ struct afs_vl_seq_net_private *priv = m->private;
+ struct afs_vlserver_list *vllist;
struct afs_cell *cell = PDE_DATA(file_inode(m->file));
loff_t pos = *_pos;
rcu_read_lock();
- alist = rcu_dereference(cell->vl_addrs);
+ vllist = rcu_dereference(cell->vl_servers);
+ priv->vllist = vllist;
- /* allow for the header line */
- if (!pos)
- return (void *) 1;
- pos--;
+ if (pos < 0)
+ *_pos = pos = 0;
+ if (pos == 0)
+ return SEQ_START_TOKEN;
- if (!alist || pos >= alist->nr_addrs)
+ if (!vllist || pos - 1 >= vllist->nr_servers)
return NULL;
- return alist->addrs + pos;
+ return &vllist->servers[pos - 1];
}
static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v,
loff_t *_pos)
{
- struct afs_addr_list *alist;
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_vl_seq_net_private *priv = m->private;
+ struct afs_vlserver_list *vllist = priv->vllist;
loff_t pos;
- alist = rcu_dereference(cell->vl_addrs);
-
pos = *_pos;
- (*_pos)++;
- if (!alist || pos >= alist->nr_addrs)
+ pos++;
+ *_pos = pos;
+ if (!vllist || pos - 1 >= vllist->nr_servers)
return NULL;
- return alist->addrs + pos;
+ return &vllist->servers[pos - 1];
}
static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v)
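
The start/next pair above uses the usual seq_file convention: pos 0 maps to
SEQ_START_TOKEN for the header line and table entry i lives at pos i + 1,
so both callbacks translate with pos - 1. The mapping in isolation (a
hypothetical helper, kernel-style):

    /* pos 0 is the header; entry i of the list sits at pos i + 1. */
    static const void *entry_for_pos(const struct afs_vlserver_list *vllist,
                                     loff_t pos)
    {
        if (pos == 0)
            return SEQ_START_TOKEN;
        if (!vllist || pos - 1 >= vllist->nr_servers)
            return NULL;                    /* end of sequence */
        return &vllist->servers[pos - 1];
    }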
@@ -337,11 +391,11 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
&server->uuid,
atomic_read(&server->usage),
&alist->addrs[0].transport,
- alist->index == 0 ? "*" : "");
+ alist->preferred == 0 ? "*" : "");
for (i = 1; i < alist->nr_addrs; i++)
seq_printf(m, " %pISpc%s\n",
&alist->addrs[i].transport,
- alist->index == i ? "*" : "");
+ alist->preferred == i ? "*" : "");
return 0;
}
@@ -562,7 +616,7 @@ int afs_proc_cell_setup(struct afs_cell *cell)
if (!proc_create_net_data("vlservers", 0444, dir,
&afs_proc_cell_vlservers_ops,
- sizeof(struct seq_net_private),
+ sizeof(struct afs_vl_seq_net_private),
cell) ||
!proc_create_net_data("volumes", 0444, dir,
&afs_proc_cell_volumes_ops,
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
new file mode 100644
index 000000000000..d443e2bfa094
--- /dev/null
+++ b/fs/afs/protocol_yfs.h
@@ -0,0 +1,174 @@
+/* YFS protocol bits
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define YFS_FS_SERVICE 2500
+#define YFS_CM_SERVICE 2501
+
+#define YFSCBMAX 1024
+
+enum YFS_CM_Operations {
+ YFSCBProbe = 206, /* probe client */
+ YFSCBGetLock = 207, /* get contents of CM lock table */
+ YFSCBXStatsVersion = 209, /* get version of extended statistics */
+ YFSCBGetXStats = 210, /* get contents of extended statistics data */
+ YFSCBInitCallBackState3 = 213, /* initialise callback state, version 3 */
+ YFSCBProbeUuid = 214, /* check the client hasn't rebooted */
+ YFSCBGetServerPrefs = 215,
+ YFSCBGetCellServDV = 216,
+ YFSCBGetLocalCell = 217,
+ YFSCBGetCacheConfig = 218,
+ YFSCBGetCellByNum = 65537,
+ YFSCBTellMeAboutYourself = 65538, /* get client capabilities */
+ YFSCBCallBack = 64204,
+};
+
+enum YFS_FS_Operations {
+ YFSFETCHACL = 64131, /* YFS Fetch file ACL */
+ YFSFETCHSTATUS = 64132, /* YFS Fetch file status */
+ YFSSTOREACL = 64134, /* YFS Store file ACL */
+ YFSSTORESTATUS = 64135, /* YFS Store file status */
+ YFSREMOVEFILE = 64136, /* YFS Remove a file */
+ YFSCREATEFILE = 64137, /* YFS Create a file */
+ YFSRENAME = 64138, /* YFS Rename or move a file or directory */
+ YFSSYMLINK = 64139, /* YFS Create a symbolic link */
+ YFSLINK = 64140, /* YFS Create a hard link */
+ YFSMAKEDIR = 64141, /* YFS Create a directory */
+ YFSREMOVEDIR = 64142, /* YFS Remove a directory */
+ YFSGETVOLUMESTATUS = 64149, /* YFS Get volume status information */
+ YFSSETVOLUMESTATUS = 64150, /* YFS Set volume status information */
+ YFSSETLOCK = 64156, /* YFS Request a file lock */
+ YFSEXTENDLOCK = 64157, /* YFS Extend a file lock */
+ YFSRELEASELOCK = 64158, /* YFS Release a file lock */
+ YFSLOOKUP = 64161, /* YFS lookup file in directory */
+ YFSFLUSHCPS = 64165,
+ YFSFETCHOPAQUEACL = 64168,
+ YFSWHOAMI = 64170,
+ YFSREMOVEACL = 64171,
+ YFSREMOVEFILE2 = 64173,
+ YFSSTOREOPAQUEACL2 = 64174,
+ YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */
+ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */
+ YFSSTOREDATA64 = 64538, /* YFS Store file data */
+ YFSUPDATESYMLINK = 64540,
+};
+
+struct yfs_xdr_u64 {
+ __be32 msw;
+ __be32 lsw;
+} __packed;
+
+static inline u64 xdr_to_u64(const struct yfs_xdr_u64 x)
+{
+ return ((u64)ntohl(x.msw) << 32) | ntohl(x.lsw);
+}
+
+static inline struct yfs_xdr_u64 u64_to_xdr(const u64 x)
+{
+ return (struct yfs_xdr_u64){ .msw = htonl(x >> 32), .lsw = htonl(x) };
+}
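
YFS carries 64-bit quantities as two big-endian 32-bit words, and
xdr_to_u64()/u64_to_xdr() are exact inverses. A userspace round-trip check
using plain htonl()/ntohl() (a sketch mirroring the helpers above):

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>

    struct xdr_u64 { uint32_t msw, lsw; };      /* both big-endian */

    static uint64_t xdr_to_u64(struct xdr_u64 x)
    {
        return ((uint64_t)ntohl(x.msw) << 32) | ntohl(x.lsw);
    }

    static struct xdr_u64 u64_to_xdr(uint64_t x)
    {
        return (struct xdr_u64){ htonl(x >> 32), htonl((uint32_t)x) };
    }

    int main(void)
    {
        uint64_t v = 0x0123456789abcdefULL;

        assert(xdr_to_u64(u64_to_xdr(v)) == v);
        return 0;
    }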
+
+struct yfs_xdr_vnode {
+ struct yfs_xdr_u64 lo;
+ __be32 hi;
+ __be32 unique;
+} __packed;
+
+struct yfs_xdr_YFSFid {
+ struct yfs_xdr_u64 volume;
+ struct yfs_xdr_vnode vnode;
+} __packed;
+
+
+struct yfs_xdr_YFSFetchStatus {
+ __be32 type;
+ __be32 nlink;
+ struct yfs_xdr_u64 size;
+ struct yfs_xdr_u64 data_version;
+ struct yfs_xdr_u64 author;
+ struct yfs_xdr_u64 owner;
+ struct yfs_xdr_u64 group;
+ __be32 mode;
+ __be32 caller_access;
+ __be32 anon_access;
+ struct yfs_xdr_vnode parent;
+ __be32 data_access_protocol;
+ struct yfs_xdr_u64 mtime_client;
+ struct yfs_xdr_u64 mtime_server;
+ __be32 lock_count;
+ __be32 abort_code;
+} __packed;
+
+struct yfs_xdr_YFSCallBack {
+ __be32 version;
+ struct yfs_xdr_u64 expiration_time;
+ __be32 type;
+} __packed;
+
+struct yfs_xdr_YFSStoreStatus {
+ __be32 mask;
+ __be32 mode;
+ struct yfs_xdr_u64 mtime_client;
+ struct yfs_xdr_u64 owner;
+ struct yfs_xdr_u64 group;
+} __packed;
+
+struct yfs_xdr_RPCFlags {
+ __be32 rpc_flags;
+} __packed;
+
+struct yfs_xdr_YFSVolSync {
+ struct yfs_xdr_u64 vol_creation_date;
+ struct yfs_xdr_u64 vol_update_date;
+ struct yfs_xdr_u64 max_quota;
+ struct yfs_xdr_u64 blocks_in_use;
+ struct yfs_xdr_u64 blocks_avail;
+} __packed;
+
+enum yfs_volume_type {
+ yfs_volume_type_ro = 0,
+ yfs_volume_type_rw = 1,
+};
+
+#define yfs_FVSOnline 0x1
+#define yfs_FVSInservice 0x2
+#define yfs_FVSBlessed 0x4
+#define yfs_FVSNeedsSalvage 0x8
+
+struct yfs_xdr_YFSFetchVolumeStatus {
+ struct yfs_xdr_u64 vid;
+ struct yfs_xdr_u64 parent_id;
+ __be32 flags;
+ __be32 type;
+ struct yfs_xdr_u64 max_quota;
+ struct yfs_xdr_u64 blocks_in_use;
+ struct yfs_xdr_u64 part_blocks_avail;
+ struct yfs_xdr_u64 part_max_blocks;
+ struct yfs_xdr_u64 vol_copy_date;
+ struct yfs_xdr_u64 vol_backup_date;
+} __packed;
+
+struct yfs_xdr_YFSStoreVolumeStatus {
+ __be32 mask;
+ struct yfs_xdr_u64 min_quota;
+ struct yfs_xdr_u64 max_quota;
+ struct yfs_xdr_u64 file_quota;
+} __packed;
+
+enum yfs_lock_type {
+ yfs_LockNone = -1,
+ yfs_LockRead = 0,
+ yfs_LockWrite = 1,
+ yfs_LockExtend = 2,
+ yfs_LockRelease = 3,
+ yfs_LockMandatoryRead = 0x100,
+ yfs_LockMandatoryWrite = 0x101,
+ yfs_LockMandatoryExtend = 0x102,
+};
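
Putting the structs together, a YFSFid is 24 bytes on the wire: six
big-endian 32-bit words covering the 64-bit volume ID, the 96-bit vnode
number (64-bit lo plus 32-bit hi) and the 32-bit uniquifier. A hedged
userspace sketch of pulling one out of a reply buffer:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical host-order form of the wire fid above. */
    struct fid {
        uint64_t volume, vnode_lo;
        uint32_t vnode_hi, unique;
    };

    /* Decode one YFSFid; returns a pointer just past it. */
    static const void *decode_yfsfid(const void *p, struct fid *fid)
    {
        uint32_t w[6];

        memcpy(w, p, sizeof(w));
        fid->volume   = ((uint64_t)ntohl(w[0]) << 32) | ntohl(w[1]);
        fid->vnode_lo = ((uint64_t)ntohl(w[2]) << 32) | ntohl(w[3]);
        fid->vnode_hi = ntohl(w[4]);
        fid->unique   = ntohl(w[5]);
        return (const char *)p + sizeof(w);
    }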
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 1faef56b12bd..c3ae324781f8 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -19,14 +19,6 @@
#include "afs_fs.h"
/*
- * Initialise a filesystem server cursor for iterating over FS servers.
- */
-static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
-{
- memset(fc, 0, sizeof(*fc));
-}
-
-/*
* Begin an operation on the fileserver.
*
* Fileserver operations are serialised on the server by vnode, so we serialise
@@ -35,13 +27,14 @@ static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode
bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
struct key *key)
{
- afs_init_fs_cursor(fc, vnode);
+ memset(fc, 0, sizeof(*fc));
fc->vnode = vnode;
fc->key = key;
fc->ac.error = SHRT_MAX;
+ fc->error = -EDESTADDRREQ;
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- fc->ac.error = -EINTR;
+ fc->error = -EINTR;
fc->flags |= AFS_FS_CURSOR_STOP;
return false;
}
@@ -65,12 +58,15 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
fc->server_list = afs_get_serverlist(vnode->volume->servers);
read_unlock(&vnode->volume->servers_lock);
+ fc->untried = (1UL << fc->server_list->nr_servers) - 1;
+ fc->index = READ_ONCE(fc->server_list->preferred);
+
cbi = vnode->cb_interest;
if (cbi) {
/* See if the vnode's preferred record is still available */
for (i = 0; i < fc->server_list->nr_servers; i++) {
if (fc->server_list->servers[i].cb_interest == cbi) {
- fc->start = i;
+ fc->index = i;
goto found_interest;
}
}
@@ -80,7 +76,7 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
* and have to return an error.
*/
if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
return false;
}
@@ -94,12 +90,9 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
afs_put_cb_interest(afs_v2net(vnode), cbi);
cbi = NULL;
- } else {
- fc->start = READ_ONCE(fc->server_list->index);
}
found_interest:
- fc->index = fc->start;
return true;
}
@@ -117,7 +110,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
default: m = "busy"; break;
}
- pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m);
+ pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
}
/*
@@ -127,7 +120,7 @@ static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
{
msleep_interruptible(1000);
if (signal_pending(current)) {
- fc->ac.error = -ERESTARTSYS;
+ fc->error = -ERESTARTSYS;
return false;
}
@@ -143,27 +136,33 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
struct afs_addr_list *alist;
struct afs_server *server;
struct afs_vnode *vnode = fc->vnode;
+ struct afs_error e;
+ u32 rtt;
+ int error = fc->ac.error, i;
- _enter("%u/%u,%u/%u,%d,%d",
- fc->index, fc->start,
- fc->ac.index, fc->ac.start,
- fc->ac.error, fc->ac.abort_code);
+ _enter("%lx[%d],%lx[%d],%d,%d",
+ fc->untried, fc->index,
+ fc->ac.tried, fc->ac.index,
+ error, fc->ac.abort_code);
if (fc->flags & AFS_FS_CURSOR_STOP) {
_leave(" = f [stopped]");
return false;
}
+ fc->nr_iterations++;
+
/* Evaluate the result of the previous operation, if there was one. */
- switch (fc->ac.error) {
+ switch (error) {
case SHRT_MAX:
goto start;
case 0:
default:
/* Success or local failure. Stop. */
+ fc->error = error;
fc->flags |= AFS_FS_CURSOR_STOP;
- _leave(" = f [okay/local %d]", fc->ac.error);
+ _leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
@@ -178,7 +177,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* - May indicate that the fileserver couldn't attach to the vol.
*/
if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
- fc->ac.error = -EREMOTEIO;
+ fc->error = -EREMOTEIO;
goto next_server;
}
@@ -187,12 +186,12 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
write_unlock(&vnode->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_check_volume_status(vnode->volume, fc->key);
+ if (error < 0)
+ goto failed_set_error;
if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
- fc->ac.error = -ENOMEDIUM;
+ fc->error = -ENOMEDIUM;
goto failed;
}
@@ -200,7 +199,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* it's the fileserver having trouble.
*/
if (vnode->volume->servers == fc->server_list) {
- fc->ac.error = -EREMOTEIO;
+ fc->error = -EREMOTEIO;
goto next_server;
}
@@ -215,7 +214,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
case VONLINE:
case VDISKFULL:
case VOVERQUOTA:
- fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
goto next_server;
case VOFFLINE:
@@ -224,11 +223,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
}
if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
- fc->ac.error = -EADV;
+ fc->error = -EADV;
goto failed;
}
if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
goto failed;
}
goto busy;
@@ -240,7 +239,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* have a file lock we need to maintain.
*/
if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
- fc->ac.error = -EBUSY;
+ fc->error = -EBUSY;
goto failed;
}
if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
@@ -269,16 +268,16 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* honour, just in case someone sets up a loop.
*/
if (fc->flags & AFS_FS_CURSOR_VMOVED) {
- fc->ac.error = -EREMOTEIO;
+ fc->error = -EREMOTEIO;
goto failed;
}
fc->flags |= AFS_FS_CURSOR_VMOVED;
set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_check_volume_status(vnode->volume, fc->key);
+ if (error < 0)
+ goto failed_set_error;
/* If the server list didn't change, then the VLDB is
* out of sync with the fileservers. This is hopefully
@@ -290,7 +289,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* TODO: Retry a few times with sleeps.
*/
if (vnode->volume->servers == fc->server_list) {
- fc->ac.error = -ENOMEDIUM;
+ fc->error = -ENOMEDIUM;
goto failed;
}
@@ -299,20 +298,28 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
default:
clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
- fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
goto failed;
}
+ case -ETIMEDOUT:
+ case -ETIME:
+ if (fc->error != -EDESTADDRREQ)
+ goto iterate_address;
+ /* Fall through */
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
- case -ETIMEDOUT:
- case -ETIME:
_debug("no conn");
+ fc->error = error;
goto iterate_address;
case -ECONNRESET:
_debug("call reset");
+ fc->error = error;
goto failed;
}
@@ -328,15 +335,57 @@ start:
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
- fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_check_volume_status(vnode->volume, fc->key);
+ if (error < 0)
+ goto failed_set_error;
if (!afs_start_fs_iteration(fc, vnode))
goto failed;
-use_server:
- _debug("use");
+ _debug("__ VOL %llx __", vnode->volume->vid);
+ error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
+ if (error < 0)
+ goto failed_set_error;
+
+pick_server:
+ _debug("pick [%lx]", fc->untried);
+
+ error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
+ if (error < 0)
+ goto failed_set_error;
+
+ /* Pick the untried server with the lowest RTT. If we have outstanding
+ * callbacks, we stick with the server we're already using if we can.
+ */
+ if (fc->cbi) {
+ _debug("cbi %u", fc->index);
+ if (test_bit(fc->index, &fc->untried))
+ goto selected_server;
+ afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
+ fc->cbi = NULL;
+ _debug("nocbi");
+ }
+
+ fc->index = -1;
+ rtt = U32_MAX;
+ for (i = 0; i < fc->server_list->nr_servers; i++) {
+ struct afs_server *s = fc->server_list->servers[i].server;
+
+ if (!test_bit(i, &fc->untried) || !s->probe.responded)
+ continue;
+ if (s->probe.rtt < rtt) {
+ fc->index = i;
+ rtt = s->probe.rtt;
+ }
+ }
+
+ if (fc->index == -1)
+ goto no_more_servers;
+
+selected_server:
+ _debug("use %d", fc->index);
+ __clear_bit(fc->index, &fc->untried);
+
/* We're starting on a different fileserver from the list. We need to
* check it, create a callback intercept, find its address list and
* probe its capabilities before we use it.
@@ -354,10 +403,10 @@ use_server:
* break request before we've finished decoding the reply and
* installing the vnode.
*/
- fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list,
- fc->index);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_register_server_cb_interest(vnode, fc->server_list,
+ fc->index);
+ if (error < 0)
+ goto failed_set_error;
fc->cbi = afs_get_cb_interest(vnode->cb_interest);
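
The pick_server loop above reduces to: among servers still set in the
untried mask whose probes got a response, take the one with the lowest
RTT. In isolation (a hypothetical helper, not the kernel function):

    #include <limits.h>
    #include <stdbool.h>

    struct probe { bool responded; unsigned int rtt; };

    /* Returns the chosen server index, or -1 if none responded. */
    static int pick_server(const struct probe *p, int nr,
                           unsigned long untried)
    {
        unsigned int best_rtt = UINT_MAX;
        int i, best = -1;

        for (i = 0; i < nr; i++) {
            if (!(untried & (1UL << i)) || !p[i].responded)
                continue;
            if (p[i].rtt < best_rtt) {
                best = i;
                best_rtt = p[i].rtt;
            }
        }
        return best;
    }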
@@ -369,66 +418,53 @@ use_server:
memset(&fc->ac, 0, sizeof(fc->ac));
- /* Probe the current fileserver if we haven't done so yet. */
- if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
- fc->ac.alist = afs_get_addrlist(alist);
-
- if (!afs_probe_fileserver(fc)) {
- switch (fc->ac.error) {
- case -ENOMEM:
- case -ERESTARTSYS:
- case -EINTR:
- goto failed;
- default:
- goto next_server;
- }
- }
- }
-
if (!fc->ac.alist)
fc->ac.alist = alist;
else
afs_put_addrlist(alist);
- fc->ac.start = READ_ONCE(alist->index);
- fc->ac.index = fc->ac.start;
+ fc->ac.index = -1;
iterate_address:
ASSERT(fc->ac.alist);
- _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
if (!afs_iterate_addresses(&fc->ac))
goto next_server;
+ _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
+
_leave(" = t");
return true;
next_server:
_debug("next");
afs_end_cursor(&fc->ac);
- afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
- fc->cbi = NULL;
- fc->index++;
- if (fc->index >= fc->server_list->nr_servers)
- fc->index = 0;
- if (fc->index != fc->start)
- goto use_server;
+ goto pick_server;
+no_more_servers:
/* That's all the servers poked to no good effect. Try again if some
* of them were busy.
*/
if (fc->flags & AFS_FS_CURSOR_VBUSY)
goto restart_from_beginning;
- fc->ac.error = -EDESTADDRREQ;
- goto failed;
+ e.error = -EDESTADDRREQ;
+ e.responded = false;
+ for (i = 0; i < fc->server_list->nr_servers; i++) {
+ struct afs_server *s = fc->server_list->servers[i].server;
+
+ afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ s->probe.abort_code);
+	}
+
+	error = e.error;
+
+failed_set_error:
+	fc->error = error;
failed:
fc->flags |= AFS_FS_CURSOR_STOP;
afs_end_cursor(&fc->ac);
- _leave(" = f [failed %d]", fc->ac.error);
+ _leave(" = f [failed %d]", fc->error);
return false;
}
@@ -442,13 +478,14 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
struct afs_vnode *vnode = fc->vnode;
struct afs_cb_interest *cbi = vnode->cb_interest;
struct afs_addr_list *alist;
+ int error = fc->ac.error;
_enter("");
- switch (fc->ac.error) {
+ switch (error) {
case SHRT_MAX:
if (!cbi) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
fc->flags |= AFS_FS_CURSOR_STOP;
return false;
}
@@ -461,35 +498,40 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
afs_get_addrlist(alist);
read_unlock(&cbi->server->fs_lock);
if (!alist) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
fc->flags |= AFS_FS_CURSOR_STOP;
return false;
}
memset(&fc->ac, 0, sizeof(fc->ac));
fc->ac.alist = alist;
- fc->ac.start = READ_ONCE(alist->index);
- fc->ac.index = fc->ac.start;
+ fc->ac.index = -1;
goto iterate_address;
case 0:
default:
/* Success or local failure. Stop. */
+ fc->error = error;
fc->flags |= AFS_FS_CURSOR_STOP;
- _leave(" = f [okay/local %d]", fc->ac.error);
+ _leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
fc->flags |= AFS_FS_CURSOR_STOP;
_leave(" = f [abort]");
return false;
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
_debug("no conn");
+ fc->error = error;
goto iterate_address;
}
@@ -507,12 +549,66 @@ iterate_address:
}
/*
+ * Dump cursor state in the case of the error being EDESTADDRREQ.
+ */
+static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
+{
+ static int count;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
+ return;
+ count++;
+
+ rcu_read_lock();
+
+ pr_notice("EDESTADDR occurred\n");
+ pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
+ fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
+ pr_notice("FC: ut=%lx ix=%d ni=%u\n",
+ fc->untried, fc->index, fc->nr_iterations);
+
+ if (fc->server_list) {
+ const struct afs_server_list *sl = fc->server_list;
+ pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
+ sl->nr_servers, sl->preferred, sl->vnovol_mask);
+ for (i = 0; i < sl->nr_servers; i++) {
+ const struct afs_server *s = sl->servers[i].server;
+ pr_notice("FC: server fl=%lx av=%u %pU\n",
+ s->flags, s->addr_version, &s->uuid);
+ if (s->addresses) {
+ const struct afs_addr_list *a =
+ rcu_dereference(s->addresses);
+ pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
+ a->version,
+ a->nr_ipv4, a->nr_addrs, a->max_addrs,
+ a->preferred);
+ pr_notice("FC: - pr=%lx R=%lx F=%lx\n",
+ a->probed, a->responded, a->failed);
+ if (a == fc->ac.alist)
+ pr_notice("FC: - current\n");
+ }
+ }
+ }
+
+ pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+ fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
+ fc->ac.responded, fc->ac.nr_iterations);
+ rcu_read_unlock();
+}
+
+/*
* Tidy up a filesystem cursor and unlock the vnode.
*/
int afs_end_vnode_operation(struct afs_fs_cursor *fc)
{
struct afs_net *net = afs_v2net(fc->vnode);
- int ret;
+
+ if (fc->error == -EDESTADDRREQ ||
+ fc->error == -EADDRNOTAVAIL ||
+ fc->error == -ENETUNREACH ||
+ fc->error == -EHOSTUNREACH)
+ afs_dump_edestaddrreq(fc);
mutex_unlock(&fc->vnode->io_lock);
@@ -520,9 +616,8 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc)
afs_put_cb_interest(net, fc->cbi);
afs_put_serverlist(net, fc->server_list);
- ret = fc->ac.error;
- if (ret == -ECONNABORTED)
- afs_abort_to_error(fc->ac.abort_code);
+ if (fc->error == -ECONNABORTED)
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
- return fc->ac.error;
+ return fc->error;
}
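The no_more_servers path above folds every server's recorded probe error into a single struct afs_error via afs_prioritise_error(), so the most informative failure is what the caller eventually sees. Below is a minimal sketch of such a prioritiser, assuming a two-field accumulator and a deliberately simplified ranking (the real helper in misc.c distinguishes more error codes):

/* Sketch only: accumulate the "best" error seen across servers.
 * Assumed rule: an abort from a server that responded outranks any
 * transport failure, which outranks "no error recorded yet".
 */
struct afs_error {
	short	error;		/* accumulated error */
	bool	responded;	/* T if a server responded */
};

static void prioritise_error_sketch(struct afs_error *e,
				    int error, u32 abort_code)
{
	switch (error) {
	case 0:
		return;			/* nothing new to record */
	case -ECONNABORTED:
		e->error = afs_abort_to_error(abort_code);
		e->responded = true;	/* the remote end answered */
		return;
	default:
		if (!e->responded)
			e->error = error; /* keep transport errors only
					   * until something responds */
		return;
	}
}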
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 77a83790a31f..2c588f9bbbda 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -16,12 +16,14 @@
#include <net/af_rxrpc.h>
#include "internal.h"
#include "afs_cm.h"
+#include "protocol_yfs.h"
struct workqueue_struct *afs_async_calls;
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
+static void afs_delete_async_call(struct work_struct *);
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
@@ -75,6 +77,18 @@ int afs_open_socket(struct afs_net *net)
if (ret < 0)
goto error_2;
+ srx.srx_service = YFS_CM_SERVICE;
+ ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+ if (ret < 0)
+ goto error_2;
+
+ /* Ideally, we'd turn on service upgrade here, but we can't because
+ * OpenAFS is buggy and leaks the userStatus field from packet to
+ * packet and between FS packets and CB packets - so if we try to do an
+ * upgrade on an FS packet, OpenAFS will leak that into the CB packet
+ * it sends back to us.
+ */
+
rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
afs_rx_discard_new_call);
@@ -143,6 +157,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
+ call->_iter = &call->iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
trace_afs_call(call, afs_call_trace_alloc, 1, o,
@@ -176,6 +191,7 @@ void afs_put_call(struct afs_call *call)
afs_put_server(call->net, call->cm_server);
afs_put_cb_interest(call->net, call->cbi);
+ afs_put_addrlist(call->alist);
kfree(call->request);
trace_afs_call(call, afs_call_trace_free, 0, o,
@@ -188,22 +204,29 @@ void afs_put_call(struct afs_call *call)
}
}
-/*
- * Queue the call for actual work. Returns 0 unconditionally for convenience.
- */
-int afs_queue_call_work(struct afs_call *call)
+static struct afs_call *afs_get_call(struct afs_call *call,
+ enum afs_call_trace why)
{
int u = atomic_inc_return(&call->usage);
- trace_afs_call(call, afs_call_trace_work, u,
+ trace_afs_call(call, why, u,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
+ return call;
+}
- INIT_WORK(&call->work, call->type->work);
+/*
+ * Queue the call for actual work.
+ */
+static void afs_queue_call_work(struct afs_call *call)
+{
+ if (call->type->work) {
+ INIT_WORK(&call->work, call->type->work);
- if (!queue_work(afs_wq, &call->work))
- afs_put_call(call);
- return 0;
+ afs_get_call(call, afs_call_trace_work);
+ if (!queue_work(afs_wq, &call->work))
+ afs_put_call(call);
+ }
}
/*
@@ -233,6 +256,7 @@ struct afs_call *afs_alloc_flat_call(struct afs_net *net,
goto nomem_free;
}
+ afs_extract_to_buf(call, call->reply_max);
call->operation_ID = type->op;
init_waitqueue_head(&call->waitq);
return call;
@@ -286,7 +310,7 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg,
offset = 0;
}
- iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes);
+ iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes);
}
/*
@@ -342,7 +366,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
gfp_t gfp, bool async)
{
- struct sockaddr_rxrpc *srx = ac->addr;
+ struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
@@ -359,6 +383,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
atomic_read(&call->net->nr_outstanding_calls));
call->async = async;
+ call->addr_ix = ac->index;
+ call->alist = afs_get_addrlist(ac->alist);
/* Work out the length we're going to transmit. This is awkward for
* calls such as FS.StoreData where there's an extra injection of data
@@ -379,6 +405,12 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
}
}
+ /* If the call is going to be asynchronous, we need an extra ref for
+ * the call to hold itself so the caller need not hang on to its ref.
+ */
+ if (call->async)
+ afs_get_call(call, afs_call_trace_get);
+
/* create a call */
rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
(unsigned long)call,
@@ -390,6 +422,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
call->debug_id);
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
+ call->error = ret;
goto error_kill_call;
}
@@ -401,8 +434,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
- call->request_size);
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
@@ -419,20 +451,22 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
goto error_do_abort;
}
- /* at this point, an async call may no longer exist as it may have
- * already completed */
- if (call->async)
+ /* Note that at this point, we may have received the reply or an abort
+ * - and an asynchronous call may already have completed.
+ */
+ if (call->async) {
+ afs_put_call(call);
return -EINPROGRESS;
+ }
return afs_wait_for_call_to_complete(call, ac);
error_do_abort:
- call->state = AFS_CALL_COMPLETE;
if (ret != -ECONNABORTED) {
rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret, "KSD");
} else {
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, false,
&call->abort_code, &call->service_id);
@@ -442,8 +476,26 @@ error_do_abort:
call->error = ret;
trace_afs_call_done(call);
error_kill_call:
- afs_put_call(call);
+ if (call->type->done)
+ call->type->done(call);
+
+ /* We need to dispose of the extra ref we grabbed for an async call.
+ * The call, however, might be queued on afs_async_calls and we need to
+ * make sure we don't get any more notifications that might requeue it.
+ */
+ if (call->rxcall) {
+ rxrpc_kernel_end_call(call->net->socket, call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->async) {
+ if (cancel_work_sync(&call->async_work))
+ afs_put_call(call);
+ afs_put_call(call);
+ }
+
ac->error = ret;
+ call->state = AFS_CALL_COMPLETE;
+ afs_put_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -466,14 +518,12 @@ static void afs_deliver_to_call(struct afs_call *call)
state == AFS_CALL_SV_AWAIT_ACK
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
- struct iov_iter iter;
-
- iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0);
+ iov_iter_kvec(&call->iter, READ, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
- call->rxcall, &iter, false,
- &remote_abort,
+ call->rxcall, &call->iter,
+ false, &remote_abort,
&call->service_id);
- trace_afs_recv_data(call, 0, 0, false, ret);
+ trace_afs_receive_data(call, &call->iter, false, ret);
if (ret == -EINPROGRESS || ret == -EAGAIN)
return;
@@ -485,10 +535,17 @@ static void afs_deliver_to_call(struct afs_call *call)
return;
}
+ if (call->want_reply_time &&
+ rxrpc_kernel_get_reply_time(call->net->socket,
+ call->rxcall,
+ &call->reply_time))
+ call->want_reply_time = false;
+
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
switch (ret) {
case 0:
+ afs_queue_call_work(call);
if (state == AFS_CALL_CL_PROC_REPLY) {
if (call->cbi)
set_bit(AFS_SERVER_FL_MAY_HAVE_CB,
@@ -500,7 +557,6 @@ static void afs_deliver_to_call(struct afs_call *call)
case -EINPROGRESS:
case -EAGAIN:
goto out;
- case -EIO:
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
goto done;
@@ -509,6 +565,10 @@ static void afs_deliver_to_call(struct afs_call *call)
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KIV");
goto local_abort;
+ case -EIO:
+ pr_err("kAFS: Call %u in bad state %u\n",
+ call->debug_id, state);
+ /* Fall through */
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
@@ -517,12 +577,14 @@ static void afs_deliver_to_call(struct afs_call *call)
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, -EBADMSG, "KUM");
+ abort_code, ret, "KUM");
goto local_abort;
}
}
done:
+ if (call->type->done)
+ call->type->done(call);
if (state == AFS_CALL_COMPLETE && call->incoming)
afs_put_call(call);
out:
@@ -545,6 +607,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
{
signed long rtt2, timeout;
long ret;
+ bool stalled = false;
u64 rtt;
u32 life, last_life;
@@ -578,12 +641,20 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
if (timeout == 0 &&
- life == last_life && signal_pending(current))
+ life == last_life && signal_pending(current)) {
+ if (stalled)
break;
+ __set_current_state(TASK_RUNNING);
+ rxrpc_kernel_probe_life(call->net->socket, call->rxcall);
+ timeout = rtt2;
+ stalled = true;
+ continue;
+ }
if (life != last_life) {
timeout = rtt2;
last_life = life;
+ stalled = false;
}
timeout = schedule_timeout(timeout);
@@ -728,6 +799,7 @@ void afs_charge_preallocation(struct work_struct *work)
call->async = true;
call->state = AFS_CALL_SV_AWAIT_OP_ID;
init_waitqueue_head(&call->waitq);
+ afs_extract_to_tmp(call);
}
if (rxrpc_kernel_charge_accept(net->socket,
@@ -773,18 +845,15 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
{
int ret;
- _enter("{%zu}", call->offset);
-
- ASSERTCMP(call->offset, <, 4);
+ _enter("{%zu}", iov_iter_count(call->_iter));
/* the operation ID forms the first four bytes of the request data */
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->operation_ID = ntohl(call->tmp);
afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST);
- call->offset = 0;
/* ask the cache manager to route the call (it'll change the call type
* if successful) */
@@ -825,7 +894,7 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -864,7 +933,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -888,30 +957,19 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
/*
* Extract a piece of data from the received data socket buffers.
*/
-int afs_extract_data(struct afs_call *call, void *buf, size_t count,
- bool want_more)
+int afs_extract_data(struct afs_call *call, bool want_more)
{
struct afs_net *net = call->net;
- struct iov_iter iter;
- struct kvec iov;
+ struct iov_iter *iter = call->_iter;
enum afs_call_state state;
u32 remote_abort = 0;
int ret;
- _enter("{%s,%zu},,%zu,%d",
- call->type->name, call->offset, count, want_more);
-
- ASSERTCMP(call->offset, <=, count);
-
- iov.iov_base = buf + call->offset;
- iov.iov_len = count - call->offset;
- iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset);
+ _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more);
- ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter,
+ ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
want_more, &remote_abort,
&call->service_id);
- call->offset += (count - call->offset) - iov_iter_count(&iter);
- trace_afs_recv_data(call, count, call->offset, want_more, ret);
if (ret == 0 || ret == -EAGAIN)
return ret;
@@ -926,7 +984,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
break;
case AFS_CALL_COMPLETE:
kdebug("prem complete %d", call->error);
- return -EIO;
+ return afs_io_error(call, afs_io_error_extract);
default:
break;
}
@@ -940,8 +998,9 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
/*
* Log protocol error production.
*/
-noinline int afs_protocol_error(struct afs_call *call, int error)
+noinline int afs_protocol_error(struct afs_call *call, int error,
+ enum afs_eproto_cause cause)
{
- trace_afs_protocol_error(call, error, __builtin_return_address(0));
+ trace_afs_protocol_error(call, error, cause);
return error;
}
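The extraction rework above replaces per-call offset arithmetic with a persistent iov_iter: afs_extract_to_tmp(), afs_extract_to_buf() and afs_extract_discard() prime call->_iter with a destination, and afs_extract_data() then drains the socket into it. A hedged sketch of the resulting unmarshalling shape follows; the helpers are the ones introduced by this patch, but the two-field wire layout is invented purely for illustration:

/* Sketch: phased delivery with the new iterator helpers.  Each phase
 * primes the iterator, then consumes the bytes; a negative return from
 * afs_extract_data() (including -EAGAIN) means "call again with more
 * data from the socket".
 */
static int example_deliver(struct afs_call *call)
{
	int ret;

	switch (call->unmarshall) {
	case 0:
		afs_extract_to_tmp(call);	/* expect one __be32 count */
		call->unmarshall++;
		/* Fall through */
	case 1:
		ret = afs_extract_data(call, true);
		if (ret < 0)
			return ret;
		call->count = ntohl(call->tmp);
		afs_extract_to_buf(call, call->count * sizeof(__be32));
		call->unmarshall++;
		/* Fall through */
	case 2:
		ret = afs_extract_data(call, false);	/* final chunk */
		if (ret < 0)
			return ret;
		call->unmarshall++;
		break;
	}
	return 0;
}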
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 81dfedb7879f..5f58a9a17e69 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -126,7 +126,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
bool changed = false;
int i, j;
- _enter("{%x:%u},%x,%x",
+ _enter("{%llx:%llu},%x,%x",
vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access);
rcu_read_lock();
@@ -147,7 +147,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
break;
}
- if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) {
+ if (afs_cb_is_broken(cb_break, vnode,
+ vnode->cb_interest)) {
changed = true;
break;
}
@@ -177,7 +178,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
}
}
- if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest))
+ if (afs_cb_is_broken(cb_break, vnode, vnode->cb_interest))
goto someone_else_changed_it;
/* We need a ref on any permits list we want to copy as we'll have to
@@ -256,7 +257,7 @@ found:
spin_lock(&vnode->lock);
zap = rcu_access_pointer(vnode->permit_cache);
- if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) &&
+ if (!afs_cb_is_broken(cb_break, vnode, vnode->cb_interest) &&
zap == permits)
rcu_assign_pointer(vnode->permit_cache, replacement);
else
@@ -289,7 +290,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
bool valid = false;
int i, ret;
- _enter("{%x:%u},%x",
+ _enter("{%llx:%llu},%x",
vnode->fid.vid, vnode->fid.vnode, key_serial(key));
/* check the permits to see if we've got one yet */
@@ -349,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- _enter("{{%x:%u},%lx},%x,",
+ _enter("{{%llx:%llu},%lx},%x,",
vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
key = afs_request_key(vnode->volume->cell);
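The permit-cache checks above switch from comparing a summed break counter against afs_cb_break_sum() to an afs_cb_is_broken() predicate. A plausible minimal form of that predicate, assuming it still just compares the caller's snapshot against the current vnode and fileserver break counters (the field names here are assumptions, not taken from this hunk):

/* Sketch: has a callback break occurred since the snapshot was taken?
 * Assumed to compare the snapshot with the sum of the vnode's break
 * counter and the serving fileserver's break counter.
 */
static inline bool cb_is_broken_sketch(unsigned int cb_break,
				       const struct afs_vnode *vnode,
				       const struct afs_cb_interest *cbi)
{
	return !cbi || cb_break != (vnode->cb_break +
				    cbi->server->cb_s_break);
}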
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 1d329e6981d5..642afa2e9783 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include "afs_fs.h"
#include "internal.h"
+#include "protocol_yfs.h"
static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */
static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */
@@ -230,6 +231,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net,
rwlock_init(&server->fs_lock);
INIT_HLIST_HEAD(&server->cb_volumes);
rwlock_init(&server->cb_break_lock);
+ init_waitqueue_head(&server->probe_wq);
+ spin_lock_init(&server->probe_lock);
afs_inc_servers_outstanding(net);
_leave(" = %p", server);
@@ -246,41 +249,23 @@ enomem:
static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
struct key *key, const uuid_t *uuid)
{
- struct afs_addr_cursor ac;
- struct afs_addr_list *alist;
+ struct afs_vl_cursor vc;
+ struct afs_addr_list *alist = NULL;
int ret;
- ret = afs_set_vl_cursor(&ac, cell);
- if (ret < 0)
- return ERR_PTR(ret);
-
- while (afs_iterate_addresses(&ac)) {
- if (test_bit(ac.index, &ac.alist->yfs))
- alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid);
- else
- alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid);
- switch (ac.error) {
- case 0:
- afs_end_cursor(&ac);
- return alist;
- case -ECONNABORTED:
- ac.error = afs_abort_to_error(ac.abort_code);
- goto error;
- case -ENOMEM:
- case -ENONET:
- goto error;
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- break;
- default:
- ac.error = -EIO;
- goto error;
+ ret = -ERESTARTSYS;
+ if (afs_begin_vlserver_operation(&vc, cell, key)) {
+ while (afs_select_vlserver(&vc)) {
+ if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
+ alist = afs_yfsvl_get_endpoints(&vc, uuid);
+ else
+ alist = afs_vl_get_addrs_u(&vc, uuid);
}
+
+ ret = afs_end_vlserver_operation(&vc);
}
-error:
- return ERR_PTR(afs_end_cursor(&ac));
+ return ret < 0 ? ERR_PTR(ret) : alist;
}
/*
@@ -382,9 +367,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
struct afs_addr_cursor ac = {
.alist = alist,
- .start = alist->index,
- .index = 0,
- .addr = &alist->addrs[alist->index],
+ .index = alist->preferred,
.error = 0,
};
_enter("%p", server);
@@ -392,6 +375,9 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+ wait_var_event(&server->probe_outstanding,
+ atomic_read(&server->probe_outstanding) == 0);
+
call_rcu(&server->rcu, afs_server_rcu);
afs_dec_servers_outstanding(net);
}
@@ -525,99 +511,6 @@ void afs_purge_servers(struct afs_net *net)
}
/*
- * Probe a fileserver to find its capabilities.
- *
- * TODO: Try service upgrade.
- */
-static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc)
-{
- _enter("");
-
- fc->ac.addr = NULL;
- fc->ac.start = READ_ONCE(fc->ac.alist->index);
- fc->ac.index = fc->ac.start;
- fc->ac.error = 0;
- fc->ac.begun = false;
-
- while (afs_iterate_addresses(&fc->ac)) {
- afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server,
- &fc->ac, fc->key);
- switch (fc->ac.error) {
- case 0:
- afs_end_cursor(&fc->ac);
- set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags);
- return true;
- case -ECONNABORTED:
- fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
- goto error;
- case -ENOMEM:
- case -ENONET:
- goto error;
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- case -ETIMEDOUT:
- case -ETIME:
- break;
- default:
- fc->ac.error = -EIO;
- goto error;
- }
- }
-
-error:
- afs_end_cursor(&fc->ac);
- return false;
-}
-
-/*
- * If we haven't already, try probing the fileserver to get its capabilities.
- * We try not to instigate parallel probes, but it's possible that the parallel
- * probes will fail due to authentication failure when ours would succeed.
- *
- * TODO: Try sending an anonymous probe if an authenticated probe fails.
- */
-bool afs_probe_fileserver(struct afs_fs_cursor *fc)
-{
- bool success;
- int ret, retries = 0;
-
- _enter("");
-
-retry:
- if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) {
- _leave(" = t");
- return true;
- }
-
- if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) {
- success = afs_do_probe_fileserver(fc);
- clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags);
- wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING);
- _leave(" = t");
- return success;
- }
-
- _debug("wait");
- ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING,
- TASK_INTERRUPTIBLE);
- if (ret == -ERESTARTSYS) {
- fc->ac.error = ret;
- _leave(" = f [%d]", ret);
- return false;
- }
-
- retries++;
- if (retries == 4) {
- fc->ac.error = -ESTALE;
- _leave(" = f [stale]");
- return false;
- }
- _debug("retry");
- goto retry;
-}
-
-/*
* Get an update for a server's address list.
*/
static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
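afs_destroy_server() now blocks with wait_var_event() until probe_outstanding drops to zero; the probe side pairs this with atomic_dec_and_test() plus wake_up_var(), just as afs_vl_probe_done() does for vlservers later in this patch. A minimal sketch of the pairing:

/* Sketch: completion-counting with wait_var_event()/wake_up_var().
 * One side waits for the counter to hit zero; each finishing probe
 * decrements it and wakes waiters on the final decrement.
 */
static void probe_done(atomic_t *outstanding)
{
	if (atomic_dec_and_test(outstanding))
		wake_up_var(outstanding);
}

static void wait_for_probes(atomic_t *outstanding)
{
	wait_var_event(outstanding, atomic_read(outstanding) == 0);
}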
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 8a5760aa5832..155dc14caef9 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -42,9 +42,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
if (vldb->fs_mask[i] & type_mask)
nr_servers++;
- slist = kzalloc(sizeof(struct afs_server_list) +
- sizeof(struct afs_server_entry) * nr_servers,
- GFP_KERNEL);
+ slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
if (!slist)
goto error;
@@ -118,11 +116,11 @@ bool afs_annotate_server_list(struct afs_server_list *new,
return false;
changed:
- /* Maintain the same current server as before if possible. */
- cur = old->servers[old->index].server;
+ /* Maintain the same preferred server as before if possible. */
+ cur = old->servers[old->preferred].server;
for (j = 0; j < new->nr_servers; j++) {
if (new->servers[j].server == cur) {
- new->index = j;
+ new->preferred = j;
break;
}
}
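The allocation above moves to struct_size() from <linux/overflow.h>, which computes the size of a struct plus its trailing flexible array and saturates on multiplication overflow instead of wrapping. A small sketch of the idiom (the example_list type is invented for illustration):

/* Sketch: struct_size() for a flexible-array allocation. */
#include <linux/overflow.h>
#include <linux/slab.h>

struct example_list {
	unsigned int		nr;
	struct afs_server_entry	servers[];	/* flexible array */
};

static struct example_list *example_alloc(unsigned int nr)
{
	struct example_list *el;

	/* Equivalent to sizeof(*el) + nr * sizeof(el->servers[0]),
	 * but overflow-safe; el is only used inside sizeof here.
	 */
	el = kzalloc(struct_size(el, servers, nr), GFP_KERNEL);
	if (el)
		el->nr = nr;
	return el;
}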
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 4d3e274207fb..dcd07fe99871 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -406,10 +406,11 @@ static int afs_fill_super(struct super_block *sb,
inode = afs_iget_pseudo_dir(sb, true);
sb->s_flags |= SB_RDONLY;
} else {
- sprintf(sb->s_id, "%u", as->volume->vid);
+ sprintf(sb->s_id, "%llu", as->volume->vid);
afs_activate_volume(as->volume);
fid.vid = as->volume->vid;
fid.vnode = 1;
+ fid.vnode_hi = 0;
fid.unique = 1;
inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
}
@@ -663,7 +664,7 @@ static void afs_destroy_inode(struct inode *inode)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
- _enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
+ _enter("%p{%llx:%llu}", inode, vnode->fid.vid, vnode->fid.vnode);
_debug("DESTROY INODE %p", inode);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
new file mode 100644
index 000000000000..b4f1a84519b9
--- /dev/null
+++ b/fs/afs/vl_list.c
@@ -0,0 +1,340 @@
+/* AFS vlserver list management.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
+ unsigned short port)
+{
+ struct afs_vlserver *vlserver;
+
+ vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
+ GFP_KERNEL);
+ if (vlserver) {
+ atomic_set(&vlserver->usage, 1);
+ rwlock_init(&vlserver->lock);
+ init_waitqueue_head(&vlserver->probe_wq);
+ spin_lock_init(&vlserver->probe_lock);
+ vlserver->name_len = name_len;
+ vlserver->port = port;
+ memcpy(vlserver->name, name, name_len);
+ }
+ return vlserver;
+}
+
+static void afs_vlserver_rcu(struct rcu_head *rcu)
+{
+ struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);
+
+ afs_put_addrlist(rcu_access_pointer(vlserver->addresses));
+ kfree_rcu(vlserver, rcu);
+}
+
+void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
+{
+ if (vlserver) {
+ unsigned int u = atomic_dec_return(&vlserver->usage);
+ //_debug("VL PUT %p{%u}", vlserver, u);
+
+ if (u == 0)
+ call_rcu(&vlserver->rcu, afs_vlserver_rcu);
+ }
+}
+
+struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
+{
+ struct afs_vlserver_list *vllist;
+
+ vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
+ if (vllist) {
+ atomic_set(&vllist->usage, 1);
+ rwlock_init(&vllist->lock);
+ }
+
+ return vllist;
+}
+
+void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
+{
+ if (vllist) {
+ unsigned int u = atomic_dec_return(&vllist->usage);
+
+ //_debug("VLLS PUT %p{%u}", vllist, u);
+ if (u == 0) {
+ int i;
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ afs_put_vlserver(net, vllist->servers[i].server);
+ }
+ kfree_rcu(vllist, rcu);
+ }
+ }
+}
+
+static u16 afs_extract_le16(const u8 **_b)
+{
+ u16 val;
+
+ val = (u16)*(*_b)++ << 0;
+ val |= (u16)*(*_b)++ << 8;
+ return val;
+}
+
+/*
+ * Build a VL server address list from a DNS queried server list.
+ */
+static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+ u8 nr_addrs, u16 port)
+{
+ struct afs_addr_list *alist;
+ const u8 *b = *_b;
+ int ret = -EINVAL;
+
+ alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
+ if (!alist)
+ return ERR_PTR(-ENOMEM);
+ if (nr_addrs == 0)
+ return alist;
+
+ for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) {
+ struct dns_server_list_v1_address hdr;
+ __be32 x[4];
+
+ hdr.address_type = *b++;
+
+ switch (hdr.address_type) {
+ case DNS_ADDRESS_IS_IPV4:
+ if (end - b < 4) {
+ _leave(" = -EINVAL [short inet]");
+ goto error;
+ }
+ memcpy(x, b, 4);
+ afs_merge_fs_addr4(alist, x[0], port);
+ b += 4;
+ break;
+
+ case DNS_ADDRESS_IS_IPV6:
+ if (end - b < 16) {
+ _leave(" = -EINVAL [short inet6]");
+ goto error;
+ }
+ memcpy(x, b, 16);
+ afs_merge_fs_addr6(alist, x, port);
+ b += 16;
+ break;
+
+ default:
+ _leave(" = -EADDRNOTAVAIL [unknown af %u]",
+ hdr.address_type);
+ ret = -EADDRNOTAVAIL;
+ goto error;
+ }
+ }
+
+ /* Start with IPv6 if available. */
+ if (alist->nr_ipv4 < alist->nr_addrs)
+ alist->preferred = alist->nr_ipv4;
+
+ *_b = b;
+ return alist;
+
+error:
+ *_b = b;
+ afs_put_addrlist(alist);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Build a VL server list from a DNS queried server list.
+ */
+struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
+ const void *buffer,
+ size_t buffer_size)
+{
+ const struct dns_server_list_v1_header *hdr = buffer;
+ struct dns_server_list_v1_server bs;
+ struct afs_vlserver_list *vllist, *previous;
+ struct afs_addr_list *addrs;
+ struct afs_vlserver *server;
+ const u8 *b = buffer, *end = buffer + buffer_size;
+ int ret = -ENOMEM, nr_servers, i, j;
+
+ _enter("");
+
+ /* Check that it's a server list, v1 */
+ if (end - b < sizeof(*hdr) ||
+ hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST ||
+ hdr->hdr.version != 1) {
+ pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n",
+ hdr->hdr.content, hdr->hdr.version, end - b);
+ ret = -EDESTADDRREQ;
+ goto dump;
+ }
+
+ nr_servers = hdr->nr_servers;
+
+ vllist = afs_alloc_vlserver_list(nr_servers);
+ if (!vllist)
+ return ERR_PTR(-ENOMEM);
+
+ vllist->source = (hdr->source < NR__dns_record_source) ?
+ hdr->source : NR__dns_record_source;
+ vllist->status = (hdr->status < NR__dns_lookup_status) ?
+ hdr->status : NR__dns_lookup_status;
+
+ read_lock(&cell->vl_servers_lock);
+ previous = afs_get_vlserverlist(
+ rcu_dereference_protected(cell->vl_servers,
+ lockdep_is_held(&cell->vl_servers_lock)));
+ read_unlock(&cell->vl_servers_lock);
+
+ b += sizeof(*hdr);
+ while (end - b >= sizeof(bs)) {
+ bs.name_len = afs_extract_le16(&b);
+ bs.priority = afs_extract_le16(&b);
+ bs.weight = afs_extract_le16(&b);
+ bs.port = afs_extract_le16(&b);
+ bs.source = *b++;
+ bs.status = *b++;
+ bs.protocol = *b++;
+ bs.nr_addrs = *b++;
+
+ _debug("extract %u %u %u %u %u %u %*.*s",
+ bs.name_len, bs.priority, bs.weight,
+ bs.port, bs.protocol, bs.nr_addrs,
+ bs.name_len, bs.name_len, b);
+
+ if (end - b < bs.name_len)
+ break;
+
+ ret = -EPROTONOSUPPORT;
+ if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) {
+ bs.protocol = DNS_SERVER_PROTOCOL_UDP;
+ } else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) {
+ _leave(" = [proto %u]", bs.protocol);
+ goto error;
+ }
+
+ if (bs.port == 0)
+ bs.port = AFS_VL_PORT;
+ if (bs.source > NR__dns_record_source)
+ bs.source = NR__dns_record_source;
+ if (bs.status > NR__dns_lookup_status)
+ bs.status = NR__dns_lookup_status;
+
+ server = NULL;
+ if (previous) {
+ /* See if we can update an old server record */
+ for (i = 0; i < previous->nr_servers; i++) {
+ struct afs_vlserver *p = previous->servers[i].server;
+
+ if (p->name_len == bs.name_len &&
+ p->port == bs.port &&
+ strncasecmp(b, p->name, bs.name_len) == 0) {
+ server = afs_get_vlserver(p);
+ break;
+ }
+ }
+ }
+
+ if (!server) {
+ ret = -ENOMEM;
+ server = afs_alloc_vlserver(b, bs.name_len, bs.port);
+ if (!server)
+ goto error;
+ }
+
+ b += bs.name_len;
+
+ /* Extract the addresses - note that we can't skip this as we
+ * have to advance the payload pointer.
+ */
+ addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
+ if (IS_ERR(addrs)) {
+ ret = PTR_ERR(addrs);
+ goto error_2;
+ }
+
+ if (vllist->nr_servers >= nr_servers) {
+ _debug("skip %u >= %u", vllist->nr_servers, nr_servers);
+ afs_put_addrlist(addrs);
+ afs_put_vlserver(cell->net, server);
+ continue;
+ }
+
+ addrs->source = bs.source;
+ addrs->status = bs.status;
+
+ if (addrs->nr_addrs == 0) {
+ afs_put_addrlist(addrs);
+ if (!rcu_access_pointer(server->addresses)) {
+ afs_put_vlserver(cell->net, server);
+ continue;
+ }
+ } else {
+ struct afs_addr_list *old = addrs;
+
+ write_lock(&server->lock);
+ rcu_swap_protected(server->addresses, old,
+ lockdep_is_held(&server->lock));
+ write_unlock(&server->lock);
+ afs_put_addrlist(old);
+ }
+
+
+ /* TODO: Might want to check for duplicates */
+
+ /* Insertion-sort by priority and weight */
+ for (j = 0; j < vllist->nr_servers; j++) {
+ if (bs.priority < vllist->servers[j].priority)
+ break; /* Lower preferable */
+ if (bs.priority == vllist->servers[j].priority &&
+ bs.weight > vllist->servers[j].weight)
+ break; /* Higher preferable */
+ }
+
+ if (j < vllist->nr_servers) {
+ memmove(vllist->servers + j + 1,
+ vllist->servers + j,
+ (vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry));
+ }
+
+ clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+
+ vllist->servers[j].priority = bs.priority;
+ vllist->servers[j].weight = bs.weight;
+ vllist->servers[j].server = server;
+ vllist->nr_servers++;
+ }
+
+ if (b != end) {
+ _debug("parse error %zd", b - end);
+ goto error;
+ }
+
+ afs_put_vlserverlist(cell->net, previous);
+ _leave(" = ok [%u]", vllist->nr_servers);
+ return vllist;
+
+error_2:
+ afs_put_vlserver(cell->net, server);
+error:
+ afs_put_vlserverlist(cell->net, vllist);
+ afs_put_vlserverlist(cell->net, previous);
+dump:
+ if (ret != -ENOMEM) {
+ printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer);
+ print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size);
+ }
+ return ERR_PTR(ret);
+}
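The insertion sort in afs_extract_vlserver_list() orders vlservers by ascending SRV priority and, within equal priorities, by descending weight; for example, (priority 1, weight 5) sorts ahead of (1, 3), which sorts ahead of (2, 9). The comparison rule in isolation, as a sketch:

/* Sketch: the ordering rule used by the insertion sort above.
 * Returns true if a candidate (pa, wa) should be placed before an
 * existing entry (pb, wb).
 */
static bool vlserver_precedes(u16 pa, u16 wa, u16 pb, u16 wb)
{
	if (pa != pb)
		return pa < pb;		/* lower priority preferred */
	return wa > wb;			/* then higher weight preferred */
}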
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
new file mode 100644
index 000000000000..f402ee8171a1
--- /dev/null
+++ b/fs/afs/vl_probe.c
@@ -0,0 +1,282 @@
+/* AFS vlserver probing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "afs_fs.h"
+#include "internal.h"
+#include "protocol_yfs.h"
+
+static bool afs_vl_probe_done(struct afs_vlserver *server)
+{
+ if (!atomic_dec_and_test(&server->probe_outstanding))
+ return false;
+
+ wake_up_var(&server->probe_outstanding);
+ clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
+ wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
+ return true;
+}
+
+/*
+ * Process the result of probing a vlserver. This is called after successful
+ * or failed delivery of an VL.GetCapabilities operation.
+ */
+void afs_vlserver_probe_result(struct afs_call *call)
+{
+ struct afs_addr_list *alist = call->alist;
+ struct afs_vlserver *server = call->reply[0];
+ unsigned int server_index = (long)call->reply[1];
+ unsigned int index = call->addr_ix;
+ unsigned int rtt = UINT_MAX;
+ bool have_result = false;
+ u64 _rtt;
+ int ret = call->error;
+
+ _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
+
+ spin_lock(&server->probe_lock);
+
+ switch (ret) {
+ case 0:
+ server->probe.error = 0;
+ goto responded;
+ case -ECONNABORTED:
+ if (!server->probe.responded) {
+ server->probe.abort_code = call->abort_code;
+ server->probe.error = ret;
+ }
+ goto responded;
+ case -ENOMEM:
+ case -ENONET:
+ server->probe.local_failure = true;
+ afs_io_error(call, afs_io_error_vl_probe_fail);
+ goto out;
+ case -ECONNRESET: /* Responded, but call expired. */
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -EHOSTDOWN:
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ETIME:
+ default:
+ clear_bit(index, &alist->responded);
+ set_bit(index, &alist->failed);
+ if (!server->probe.responded &&
+ (server->probe.error == 0 ||
+ server->probe.error == -ETIMEDOUT ||
+ server->probe.error == -ETIME))
+ server->probe.error = ret;
+ afs_io_error(call, afs_io_error_vl_probe_fail);
+ goto out;
+ }
+
+responded:
+ set_bit(index, &alist->responded);
+ clear_bit(index, &alist->failed);
+
+ if (call->service_id == YFS_VL_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) {
+ clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ }
+ }
+
+ /* Get the RTT and scale it to fit into a 32-bit value that represents
+ * over a minute of time so that we can access it with one instruction
+ * on a 32-bit system.
+ */
+ _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+ _rtt /= 64;
+ rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
+ if (rtt < server->probe.rtt) {
+ server->probe.rtt = rtt;
+ alist->preferred = index;
+ have_result = true;
+ }
+
+ smp_wmb(); /* Set rtt before responded. */
+ server->probe.responded = true;
+ set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+out:
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+ server_index, index, &alist->addrs[index].transport,
+ (unsigned int)rtt, ret);
+
+ have_result |= afs_vl_probe_done(server);
+ if (have_result) {
+ server->probe.have_result = true;
+ wake_up_var(&server->probe.have_result);
+ wake_up_all(&server->probe_wq);
+ }
+}
+
+/*
+ * Probe all of a vlserver's addresses to find out the best route and to
+ * query its capabilities.
+ */
+static bool afs_do_probe_vlserver(struct afs_net *net,
+ struct afs_vlserver *server,
+ struct key *key,
+ unsigned int server_index,
+ struct afs_error *_e)
+{
+ struct afs_addr_cursor ac = {
+ .index = 0,
+ };
+ bool in_progress = false;
+ int err;
+
+ _enter("%s", server->name);
+
+ read_lock(&server->lock);
+ ac.alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->lock));
+ read_unlock(&server->lock);
+
+ atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+ memset(&server->probe, 0, sizeof(server->probe));
+ server->probe.rtt = UINT_MAX;
+
+ for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
+ err = afs_vl_get_capabilities(net, &ac, key, server,
+ server_index, true);
+ if (err == -EINPROGRESS)
+ in_progress = true;
+ else
+ afs_prioritise_error(_e, err, ac.abort_code);
+ }
+
+ if (!in_progress)
+ afs_vl_probe_done(server);
+ return in_progress;
+}
+
+/*
+ * Send off probes to all unprobed servers.
+ */
+int afs_send_vl_probes(struct afs_net *net, struct key *key,
+ struct afs_vlserver_list *vllist)
+{
+ struct afs_vlserver *server;
+ struct afs_error e;
+ bool in_progress = false;
+ int i;
+
+ e.error = 0;
+ e.responded = false;
+ for (i = 0; i < vllist->nr_servers; i++) {
+ server = vllist->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
+ continue;
+
+ if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) &&
+ afs_do_probe_vlserver(net, server, key, i, &e))
+ in_progress = true;
+ }
+
+ return in_progress ? 0 : e.error;
+}
+
+/*
+ * Wait for the first as-yet untried server to respond.
+ */
+int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
+ unsigned long untried)
+{
+ struct wait_queue_entry *waits;
+ struct afs_vlserver *server;
+ unsigned int rtt = UINT_MAX;
+ bool have_responders = false;
+ int pref = -1, i;
+
+ _enter("%u,%lx", vllist->nr_servers, untried);
+
+ /* Only wait for servers that have a probe outstanding. */
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
+ __clear_bit(i, &untried);
+ if (server->probe.responded)
+ have_responders = true;
+ }
+ }
+ if (have_responders || !untried)
+ return 0;
+
+ waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL);
+ if (!waits)
+ return -ENOMEM;
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ init_waitqueue_entry(&waits[i], current);
+ add_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ for (;;) {
+ bool still_probing = false;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ if (server->probe.responded)
+ goto stop;
+ if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
+ still_probing = true;
+ }
+ }
+
+ if (!still_probing || signal_pending(current))
+ goto stop;
+ schedule();
+ }
+
+stop:
+ set_current_state(TASK_RUNNING);
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ if (server->probe.responded &&
+ server->probe.rtt < rtt) {
+ pref = i;
+ rtt = server->probe.rtt;
+ }
+
+ remove_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ kfree(waits);
+
+ if (pref == -1 && signal_pending(current))
+ return -ERESTARTSYS;
+
+ if (pref >= 0)
+ vllist->preferred = pref;
+
+ _leave(" = 0 [%u]", pref);
+ return 0;
+}
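The RTT handling in afs_vlserver_probe_result() divides the 64-bit RTT from rxrpc by 64 before clamping to 32 bits, so the stored value spans 0xffffffff * 64 units. Assuming the raw value is in nanoseconds, as the "over a minute" comment suggests, that is roughly 275 seconds in a single word. A sketch of the scaling:

/* Sketch: scale a 64-bit ns RTT into 32 bits (units of 64 ns).
 * UINT_MAX * 64 ns is ~275 s, comfortably covering the "over a
 * minute" range mentioned in the comment above.
 */
static unsigned int scale_rtt(u64 rtt_ns)
{
	u64 scaled = rtt_ns / 64;

	return (scaled > UINT_MAX) ? UINT_MAX : (unsigned int)scaled;
}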
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
new file mode 100644
index 000000000000..7adde83a0648
--- /dev/null
+++ b/fs/afs/vl_rotate.c
@@ -0,0 +1,325 @@
+/* Handle vlserver selection and rotation.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include "internal.h"
+#include "afs_vl.h"
+
+/*
+ * Begin an operation on a volume location server.
+ */
+bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
+ struct key *key)
+{
+ memset(vc, 0, sizeof(*vc));
+ vc->cell = cell;
+ vc->key = key;
+ vc->error = -EDESTADDRREQ;
+ vc->ac.error = SHRT_MAX;
+
+ if (signal_pending(current)) {
+ vc->error = -EINTR;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Begin iteration through a server list, starting with the last used server if
+ * possible, or the last recorded good server if not.
+ */
+static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+{
+ struct afs_cell *cell = vc->cell;
+
+ if (wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
+ TASK_INTERRUPTIBLE)) {
+ vc->error = -ERESTARTSYS;
+ return false;
+ }
+
+ read_lock(&cell->vl_servers_lock);
+ vc->server_list = afs_get_vlserverlist(
+ rcu_dereference_protected(cell->vl_servers,
+ lockdep_is_held(&cell->vl_servers_lock)));
+ read_unlock(&cell->vl_servers_lock);
+ if (!vc->server_list || !vc->server_list->nr_servers)
+ return false;
+
+ vc->untried = (1UL << vc->server_list->nr_servers) - 1;
+ vc->index = -1;
+ return true;
+}
+
+/*
+ * Select the vlserver to use. May be called multiple times to rotate
+ * through the vlservers.
+ */
+bool afs_select_vlserver(struct afs_vl_cursor *vc)
+{
+ struct afs_addr_list *alist;
+ struct afs_vlserver *vlserver;
+ struct afs_error e;
+ u32 rtt;
+ int error = vc->ac.error, i;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+ vc->untried, vc->index,
+ vc->ac.tried, vc->ac.index,
+ error, vc->ac.abort_code);
+
+ if (vc->flags & AFS_VL_CURSOR_STOP) {
+ _leave(" = f [stopped]");
+ return false;
+ }
+
+ vc->nr_iterations++;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (error) {
+ case SHRT_MAX:
+ goto start;
+
+ default:
+ case 0:
+ /* Success or local failure. Stop. */
+ vc->error = error;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ _leave(" = f [okay/local %d]", vc->ac.error);
+ return false;
+
+ case -ECONNABORTED:
+ /* The far side rejected the operation on some grounds. This
+ * might involve the server being busy or the volume having been moved.
+ */
+ switch (vc->ac.abort_code) {
+ case AFSVL_IO:
+ case AFSVL_BADVOLOPER:
+ case AFSVL_NOMEM:
+ /* The server went weird. */
+ vc->error = -EREMOTEIO;
+ //write_lock(&vc->cell->vl_servers_lock);
+ //vc->server_list->weird_mask |= 1 << vc->index;
+ //write_unlock(&vc->cell->vl_servers_lock);
+ goto next_server;
+
+ default:
+ vc->error = afs_abort_to_error(vc->ac.abort_code);
+ goto failed;
+ }
+
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -EHOSTDOWN:
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ETIME:
+ _debug("no conn %d", error);
+ vc->error = error;
+ goto iterate_address;
+
+ case -ECONNRESET:
+ _debug("call reset");
+ vc->error = error;
+ vc->flags |= AFS_VL_CURSOR_RETRY;
+ goto next_server;
+ }
+
+restart_from_beginning:
+ _debug("restart");
+ afs_end_cursor(&vc->ac);
+ afs_put_vlserverlist(vc->cell->net, vc->server_list);
+ vc->server_list = NULL;
+ if (vc->flags & AFS_VL_CURSOR_RETRIED)
+ goto failed;
+ vc->flags |= AFS_VL_CURSOR_RETRIED;
+start:
+ _debug("start");
+
+ if (!afs_start_vl_iteration(vc))
+ goto failed;
+
+ error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
+ if (error < 0)
+ goto failed_set_error;
+
+pick_server:
+ _debug("pick [%lx]", vc->untried);
+
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
+ if (error < 0)
+ goto failed_set_error;
+
+ /* Pick the untried server with the lowest RTT. */
+ vc->index = vc->server_list->preferred;
+ if (test_bit(vc->index, &vc->untried))
+ goto selected_server;
+
+ vc->index = -1;
+ rtt = U32_MAX;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+ if (!test_bit(i, &vc->untried) || !s->probe.responded)
+ continue;
+ if (s->probe.rtt < rtt) {
+ vc->index = i;
+ rtt = s->probe.rtt;
+ }
+ }
+
+ if (vc->index == -1)
+ goto no_more_servers;
+
+selected_server:
+ _debug("use %d", vc->index);
+ __clear_bit(vc->index, &vc->untried);
+
+ /* We're starting on a different vlserver from the list. We need to
+ * check it, find its address list and probe its capabilities before we
+ * use it.
+ */
+ ASSERTCMP(vc->ac.alist, ==, NULL);
+ vlserver = vc->server_list->servers[vc->index].server;
+ vc->server = vlserver;
+
+ _debug("USING VLSERVER: %s", vlserver->name);
+
+ read_lock(&vlserver->lock);
+ alist = rcu_dereference_protected(vlserver->addresses,
+ lockdep_is_held(&vlserver->lock));
+ afs_get_addrlist(alist);
+ read_unlock(&vlserver->lock);
+
+ memset(&vc->ac, 0, sizeof(vc->ac));
+
+ if (!vc->ac.alist)
+ vc->ac.alist = alist;
+ else
+ afs_put_addrlist(alist);
+
+ vc->ac.index = -1;
+
+iterate_address:
+ ASSERT(vc->ac.alist);
+ /* Iterate over the current server's address list to try and find an
+ * address on which it will respond to us.
+ */
+ if (!afs_iterate_addresses(&vc->ac))
+ goto next_server;
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+ _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
+ return true;
+
+next_server:
+ _debug("next");
+ afs_end_cursor(&vc->ac);
+ goto pick_server;
+
+no_more_servers:
+ /* That's all the servers poked to no good effect. Try again if some
+ * of them were busy.
+ */
+ if (vc->flags & AFS_VL_CURSOR_RETRY)
+ goto restart_from_beginning;
+
+ e.error = -EDESTADDRREQ;
+ e.responded = false;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+ afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ s->probe.abort_code);
+ }
+
+failed_set_error:
+ vc->error = error;
+failed:
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ afs_end_cursor(&vc->ac);
+ _leave(" = f [failed %d]", vc->error);
+ return false;
+}
+
+/*
+ * Dump cursor state if the error is EDESTADDRREQ.
+ */
+static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+{
+ static int count;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
+ return;
+ count++;
+
+ rcu_read_lock();
+ pr_notice("EDESTADDR occurred\n");
+ pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
+ vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
+
+ if (vc->server_list) {
+ const struct afs_vlserver_list *sl = vc->server_list;
+ pr_notice("VC: SL nr=%u ix=%u\n",
+ sl->nr_servers, sl->index);
+ for (i = 0; i < sl->nr_servers; i++) {
+ const struct afs_vlserver *s = sl->servers[i].server;
+ pr_notice("VC: server %s+%hu fl=%lx E=%hd\n",
+ s->name, s->port, s->flags, s->probe.error);
+ if (s->addresses) {
+ const struct afs_addr_list *a =
+ rcu_dereference(s->addresses);
+ pr_notice("VC: - nr=%u/%u/%u pf=%u\n",
+ a->nr_ipv4, a->nr_addrs, a->max_addrs,
+ a->preferred);
+ pr_notice("VC: - pr=%lx R=%lx F=%lx\n",
+ a->probed, a->responded, a->failed);
+ if (a == vc->ac.alist)
+ pr_notice("VC: - current\n");
+ }
+ }
+ }
+
+ pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+ vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
+ vc->ac.responded, vc->ac.nr_iterations);
+ rcu_read_unlock();
+}
+
+/*
+ * Tidy up a volume location server cursor.
+ */
+int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
+{
+ struct afs_net *net = vc->cell->net;
+
+ if (vc->error == -EDESTADDRREQ ||
+ vc->error == -EADDRNOTAVAIL ||
+ vc->error == -ENETUNREACH ||
+ vc->error == -EHOSTUNREACH)
+ afs_vl_dump_edestaddrreq(vc);
+
+ afs_end_cursor(&vc->ac);
+ afs_put_vlserverlist(net, vc->server_list);
+
+ if (vc->error == -ECONNABORTED)
+ vc->error = afs_abort_to_error(vc->ac.abort_code);
+
+ return vc->error;
+}
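Callers drive this cursor with a begin/select/end loop; afs_vl_lookup_addrs() in server.c earlier in this patch is the in-tree instance. Each afs_select_vlserver() call evaluates the previous attempt's error (vc.ac.error starts at SHRT_MAX to mark "no previous result") and either yields another address to try or terminates the loop. In outline:

/* Sketch: the canonical rotation loop, mirroring afs_vl_lookup_addrs(). */
static struct afs_addr_list *lookup_sketch(struct afs_cell *cell,
					   struct key *key,
					   const uuid_t *uuid)
{
	struct afs_vl_cursor vc;
	struct afs_addr_list *alist = NULL;
	int ret = -ERESTARTSYS;

	if (afs_begin_vlserver_operation(&vc, cell, key)) {
		while (afs_select_vlserver(&vc)) {
			/* Each iteration hands back one address to try;
			 * the RPC's error feeds the next iteration.
			 */
			alist = afs_vl_get_addrs_u(&vc, uuid);
		}
		ret = afs_end_vlserver_operation(&vc);
	}
	return ret < 0 ? ERR_PTR(ret) : alist;
}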
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index c3b740813fc7..c3d9e5a5f67e 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -128,14 +128,13 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
* Dispatch a get volume entry by name or ID operation (uuid variant). If the
* volname is a decimal number then it's a volume ID not a volume name.
*/
-struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
- struct afs_addr_cursor *ac,
- struct key *key,
+struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
const char *volname,
int volnamesz)
{
struct afs_vldb_entry *entry;
struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
size_t reqsz, padsz;
__be32 *bp;
@@ -155,7 +154,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
- call->key = key;
+ call->key = vc->key;
call->reply[0] = entry;
call->ret_reply0 = true;
@@ -168,7 +167,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
memset((void *)bp + volnamesz, 0, padsz);
trace_afs_make_vl_call(call);
- return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false);
+ return (struct afs_vldb_entry *)afs_make_call(&vc->ac, call, GFP_KERNEL, false);
}
/*
@@ -187,19 +186,18 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
u32 uniquifier, nentries, count;
int i, ret;
- _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+ _enter("{%u,%zu/%u}",
+ call->unmarshall, iov_iter_count(call->_iter), call->count);
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_buf(call,
+ sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32));
call->unmarshall++;
/* Extract the returned uuid, uniquifier, nentries and blkaddrs size */
case 1:
- ret = afs_extract_data(call, call->buffer,
- sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -216,28 +214,28 @@ again:
call->reply[0] = alist;
call->count = count;
call->count2 = nentries;
- call->offset = 0;
call->unmarshall++;
+ more_entries:
+ count = min(call->count, 4U);
+ afs_extract_to_buf(call, count * sizeof(__be32));
+
/* Extract entries */
case 2:
- count = min(call->count, 4U);
- ret = afs_extract_data(call, call->buffer,
- count * sizeof(__be32),
- call->count > 4);
+ ret = afs_extract_data(call, call->count > 4);
if (ret < 0)
return ret;
alist = call->reply[0];
bp = call->buffer;
+ count = min(call->count, 4U);
for (i = 0; i < count; i++)
if (alist->nr_addrs < call->count2)
afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
call->count -= count;
if (call->count > 0)
- goto again;
- call->offset = 0;
+ goto more_entries;
call->unmarshall++;
break;
}
@@ -267,14 +265,13 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
* Dispatch an operation to get the addresses for a server, where the server is
* nominated by UUID.
*/
-struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
- struct afs_addr_cursor *ac,
- struct key *key,
+struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_ListAddrByAttributes__xdr *r;
const struct afs_uuid *u = (const struct afs_uuid *)uuid;
struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
__be32 *bp;
int i;
@@ -286,7 +283,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
if (!call)
return ERR_PTR(-ENOMEM);
- call->key = key;
+ call->key = vc->key;
call->reply[0] = NULL;
call->ret_reply0 = true;
@@ -307,7 +304,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
r->uuid.node[i] = htonl(u->node[i]);
trace_afs_make_vl_call(call);
- return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+ return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false);
}
/*
@@ -318,54 +315,51 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
u32 count;
int ret;
- _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+ _enter("{%u,%zu/%u}",
+ call->unmarshall, iov_iter_count(call->_iter), call->count);
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* Extract the capabilities word count */
case 1:
- ret = afs_extract_data(call, &call->tmp,
- 1 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
count = ntohl(call->tmp);
-
call->count = count;
call->count2 = count;
- call->offset = 0;
+
call->unmarshall++;
+ afs_extract_discard(call, count * sizeof(__be32));
/* Extract capabilities words */
case 2:
- count = min(call->count, 16U);
- ret = afs_extract_data(call, call->buffer,
- count * sizeof(__be32),
- call->count > 16);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
/* TODO: Examine capabilities */
- call->count -= count;
- if (call->count > 0)
- goto again;
- call->offset = 0;
call->unmarshall++;
break;
}
- call->reply[0] = (void *)(unsigned long)call->service_id;
-
_leave(" = 0 [done]");
return 0;
}
+static void afs_destroy_vl_get_capabilities(struct afs_call *call)
+{
+ struct afs_vlserver *server = call->reply[0];
+
+ afs_put_vlserver(call->net, server);
+ afs_flat_call_destructor(call);
+}
+
/*
* VL.GetCapabilities operation type
*/
@@ -373,11 +367,12 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
.name = "VL.GetCapabilities",
.op = afs_VL_GetCapabilities,
.deliver = afs_deliver_vl_get_capabilities,
- .destructor = afs_flat_call_destructor,
+ .done = afs_vlserver_probe_result,
+ .destructor = afs_destroy_vl_get_capabilities,
};
/*
- * Probe a fileserver for the capabilities that it supports. This can
+ * Probe a volume server for the capabilities that it supports. This can
* return up to 196 words.
*
* We use this to probe for service upgrade to determine what the server at the
@@ -385,7 +380,10 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
*/
int afs_vl_get_capabilities(struct afs_net *net,
struct afs_addr_cursor *ac,
- struct key *key)
+ struct key *key,
+ struct afs_vlserver *server,
+ unsigned int server_index,
+ bool async)
{
struct afs_call *call;
__be32 *bp;
@@ -397,9 +395,10 @@ int afs_vl_get_capabilities(struct afs_net *net,
return -ENOMEM;
call->key = key;
- call->upgrade = true; /* Let's see if this is a YFS server */
- call->reply[0] = (void *)VLGETCAPABILITIES;
- call->ret_reply0 = true;
+ call->reply[0] = afs_get_vlserver(server);
+ call->reply[1] = (void *)(long)server_index;
+ call->upgrade = true;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -407,7 +406,7 @@ int afs_vl_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- return afs_make_call(ac, call, GFP_KERNEL, false);
+ return afs_make_call(ac, call, GFP_KERNEL, async);
}
/*
@@ -426,22 +425,19 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
u32 uniquifier, size;
int ret;
- _enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2);
+ _enter("{%u,%zu,%u}",
+ call->unmarshall, iov_iter_count(call->_iter), call->count2);
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_buf(call, sizeof(uuid_t) + 3 * sizeof(__be32));
call->unmarshall = 1;
/* Extract the returned uuid, uniquifier, fsEndpoints count and
* either the first fsEndpoint type or the volEndpoints
* count if there are no fsEndpoints. */
case 1:
- ret = afs_extract_data(call, call->buffer,
- sizeof(uuid_t) +
- 3 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -451,22 +447,19 @@ again:
call->count2 = ntohl(*bp); /* Type or next count */
if (call->count > YFS_MAXENDPOINTS)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt_num);
alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
call->reply[0] = alist;
- call->offset = 0;
if (call->count == 0)
goto extract_volendpoints;
- call->unmarshall = 2;
-
- /* Extract fsEndpoints[] entries */
- case 2:
+ next_fsendpoint:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
size = sizeof(__be32) * (1 + 1 + 1);
@@ -475,11 +468,17 @@ again:
size = sizeof(__be32) * (1 + 4 + 1);
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt_type);
}
size += sizeof(__be32);
- ret = afs_extract_data(call, call->buffer, size, true);
+ afs_extract_to_buf(call, size);
+ call->unmarshall = 2;
+
+ /* Extract fsEndpoints[] entries */
+ case 2:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -488,18 +487,21 @@ again:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
if (ntohl(bp[0]) != sizeof(__be32) * 2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt4_len);
afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt6_len);
afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
bp += 6;
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt_type);
}
/* Got either the type of the next entry or the count of
@@ -507,10 +509,9 @@ again:
*/
call->count2 = ntohl(*bp++);
- call->offset = 0;
call->count--;
if (call->count > 0)
- goto again;
+ goto next_fsendpoint;
extract_volendpoints:
/* Extract the list of volEndpoints. */
@@ -518,8 +519,10 @@ again:
if (!call->count)
goto end;
if (call->count > YFS_MAXENDPOINTS)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt_type);
+ afs_extract_to_buf(call, 1 * sizeof(__be32));
call->unmarshall = 3;
/* Extract the type of volEndpoints[0]. Normally we would
@@ -527,17 +530,14 @@ again:
* data of the current one, but this is the first...
*/
case 3:
- ret = afs_extract_data(call, call->buffer, sizeof(__be32), true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
bp = call->buffer;
- call->count2 = ntohl(*bp++);
- call->offset = 0;
- call->unmarshall = 4;
- /* Extract volEndpoints[] entries */
- case 4:
+ next_volendpoint:
+ call->count2 = ntohl(*bp++);
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
size = sizeof(__be32) * (1 + 1 + 1);
@@ -546,12 +546,18 @@ again:
size = sizeof(__be32) * (1 + 4 + 1);
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt_type);
}
if (call->count > 1)
- size += sizeof(__be32);
- ret = afs_extract_data(call, call->buffer, size, true);
+ size += sizeof(__be32); /* Get next type too */
+ afs_extract_to_buf(call, size);
+ call->unmarshall = 4;
+
+ /* Extract volEndpoints[] entries */
+ case 4:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -559,34 +565,35 @@ again:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
if (ntohl(bp[0]) != sizeof(__be32) * 2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt4_len);
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt6_len);
bp += 6;
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt_type);
}
/* Got either the type of the next entry or the count of
* volEndpoints if no more fsEndpoints.
*/
- call->offset = 0;
call->count--;
- if (call->count > 0) {
- call->count2 = ntohl(*bp++);
- goto again;
- }
+ if (call->count > 0)
+ goto next_volendpoint;
end:
+ afs_extract_discard(call, 0);
call->unmarshall = 5;
/* Done */
case 5:
- ret = afs_extract_data(call, call->buffer, 0, false);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
call->unmarshall = 6;
@@ -596,11 +603,6 @@ again:
}
alist = call->reply[0];
-
- /* Start with IPv6 if available. */
- if (alist->nr_ipv4 < alist->nr_addrs)
- alist->index = alist->nr_ipv4;
-
_leave(" = 0 [done]");
return 0;
}
@@ -619,12 +621,11 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
* Dispatch an operation to get the addresses for a server, where the server is
* nominated by UUID.
*/
-struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
- struct afs_addr_cursor *ac,
- struct key *key,
+struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
__be32 *bp;
_enter("");
@@ -635,7 +636,7 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
if (!call)
return ERR_PTR(-ENOMEM);
- call->key = key;
+ call->key = vc->key;
call->reply[0] = NULL;
call->ret_reply0 = true;
@@ -646,5 +647,5 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
trace_afs_make_vl_call(call);
- return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+ return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false);
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 3037bd01f617..00975ed3640f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -74,55 +74,19 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
const char *volname,
size_t volnamesz)
{
- struct afs_addr_cursor ac;
- struct afs_vldb_entry *vldb;
+ struct afs_vldb_entry *vldb = ERR_PTR(-EDESTADDRREQ);
+ struct afs_vl_cursor vc;
int ret;
- ret = afs_set_vl_cursor(&ac, cell);
- if (ret < 0)
- return ERR_PTR(ret);
-
- while (afs_iterate_addresses(&ac)) {
- if (!test_bit(ac.index, &ac.alist->probed)) {
- ret = afs_vl_get_capabilities(cell->net, &ac, key);
- switch (ret) {
- case VL_SERVICE:
- clear_bit(ac.index, &ac.alist->yfs);
- set_bit(ac.index, &ac.alist->probed);
- ac.addr->srx_service = ret;
- break;
- case YFS_VL_SERVICE:
- set_bit(ac.index, &ac.alist->yfs);
- set_bit(ac.index, &ac.alist->probed);
- ac.addr->srx_service = ret;
- break;
- }
- }
-
- vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key,
- volname, volnamesz);
- switch (ac.error) {
- case 0:
- afs_end_cursor(&ac);
- return vldb;
- case -ECONNABORTED:
- ac.error = afs_abort_to_error(ac.abort_code);
- goto error;
- case -ENOMEM:
- case -ENONET:
- goto error;
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- break;
- default:
- ac.error = -EIO;
- goto error;
- }
+ if (!afs_begin_vlserver_operation(&vc, cell, key))
+ return ERR_PTR(-ERESTARTSYS);
+
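+	/* Rotate through the cell's VL servers until the lookup succeeds or
+	 * the cursor decides that the error is fatal.
+	 */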
+ while (afs_select_vlserver(&vc)) {
+ vldb = afs_vl_get_entry_by_name_u(&vc, volname, volnamesz);
}
-error:
- return ERR_PTR(afs_end_cursor(&ac));
+ ret = afs_end_vlserver_operation(&vc);
+ return ret < 0 ? ERR_PTR(ret) : vldb;
}
/*
@@ -270,7 +234,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
/* We look up an ID by passing it as a decimal string in the
* operation's name parameter.
*/
- idsz = sprintf(idbuf, "%u", volume->vid);
+ idsz = sprintf(idbuf, "%llu", volume->vid);
vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
if (IS_ERR(vldb)) {
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 19c04caf3c01..72efcfcf9f95 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -33,10 +33,21 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
loff_t pos, unsigned int len, struct page *page)
{
struct afs_read *req;
+ size_t p;
+ void *data;
int ret;
_enter(",,%llu", (unsigned long long)pos);
+ if (pos >= vnode->vfs_inode.i_size) {
+ p = pos & ~PAGE_MASK;
+ ASSERTCMP(p + len, <=, PAGE_SIZE);
+ data = kmap(page);
+ memset(data + p, 0, len);
+ kunmap(page);
+ return 0;
+ }
+
req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
GFP_KERNEL);
if (!req)
@@ -81,7 +92,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
- _enter("{%x:%u},{%lx},%u,%u",
+ _enter("{%llx:%llu},{%lx},%u,%u",
vnode->fid.vid, vnode->fid.vnode, index, from, to);
/* We want to store information about how much of a page is altered in
@@ -181,7 +192,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
loff_t i_size, maybe_i_size;
int ret;
- _enter("{%x:%u},{%lx}",
+ _enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, page->index);
maybe_i_size = pos + copied;
@@ -230,7 +241,7 @@ static void afs_kill_pages(struct address_space *mapping,
struct pagevec pv;
unsigned count, loop;
- _enter("{%x:%u},%lx-%lx",
+ _enter("{%llx:%llu},%lx-%lx",
vnode->fid.vid, vnode->fid.vnode, first, last);
pagevec_init(&pv);
@@ -272,7 +283,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
struct pagevec pv;
unsigned count, loop;
- _enter("{%x:%u},%lx-%lx",
+ _enter("{%llx:%llu},%lx-%lx",
vnode->fid.vid, vnode->fid.vnode, first, last);
pagevec_init(&pv);
@@ -314,7 +325,7 @@ static int afs_store_data(struct address_space *mapping,
struct list_head *p;
int ret = -ENOKEY, ret2;
- _enter("%s{%x:%u.%u},%lx,%lx,%x,%x",
+ _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -533,6 +544,7 @@ no_more:
case -ENOENT:
case -ENOMEDIUM:
case -ENXIO:
+ trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
afs_kill_pages(mapping, first, last);
mapping_set_error(mapping, ret);
break;
@@ -675,7 +687,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
unsigned count, loop;
pgoff_t first = call->first, last = call->last;
- _enter("{%x:%u},{%lx-%lx}",
+ _enter("{%llx:%llu},{%lx-%lx}",
vnode->fid.vid, vnode->fid.vnode, first, last);
pagevec_init(&pv);
@@ -714,7 +726,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t result;
size_t count = iov_iter_count(from);
- _enter("{%x.%u},{%zu},",
+ _enter("{%llx:%llu},{%zu},",
vnode->fid.vid, vnode->fid.vnode, count);
if (IS_SWAPFILE(&vnode->vfs_inode)) {
@@ -742,7 +754,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
- _enter("{%x:%u},{n=%pD},%d",
+ _enter("{%llx:%llu},{n=%pD},%d",
vnode->fid.vid, vnode->fid.vnode, file,
datasync);
@@ -760,7 +772,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
struct afs_vnode *vnode = AFS_FS_I(inode);
unsigned long priv;
- _enter("{{%x:%u}},{%lx}",
+ _enter("{{%llx:%llu}},{%lx}",
vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
sb_start_pagefault(inode->i_sb);
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index cfcc674e64a5..a2cdf25573e2 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -72,7 +72,7 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler,
-	char text[8 + 1 + 8 + 1 + 8 + 1];
+	char text[16 + 1 + 16 + 1 + 8 + 1];
size_t len;
- len = sprintf(text, "%x:%x:%x",
+ len = sprintf(text, "%llx:%llx:%x",
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
if (size == 0)
return len;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
new file mode 100644
index 000000000000..5aa57929e8c2
--- /dev/null
+++ b/fs/afs/yfsclient.c
@@ -0,0 +1,2184 @@
+/* YFS File Server client stubs
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/circ_buf.h>
+#include <linux/iversion.h>
+#include "internal.h"
+#include "afs_fs.h"
+#include "xdr_fs.h"
+#include "protocol_yfs.h"
+
+static const struct afs_fid afs_zero_fid;
+
+static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
+{
+ call->cbi = afs_get_cb_interest(cbi);
+}
+
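+/* Size of an XDR-encoded object, in 32-bit words. */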
+#define xdr_size(x) (sizeof(*x) / sizeof(__be32))
+
+static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid)
+{
+ const struct yfs_xdr_YFSFid *x = (const void *)*_bp;
+
+ fid->vid = xdr_to_u64(x->volume);
+ fid->vnode = xdr_to_u64(x->vnode.lo);
+ fid->vnode_hi = ntohl(x->vnode.hi);
+ fid->unique = ntohl(x->vnode.unique);
+ *_bp += xdr_size(x);
+}
+
+static __be32 *xdr_encode_u32(__be32 *bp, u32 n)
+{
+ *bp++ = htonl(n);
+ return bp;
+}
+
+static __be32 *xdr_encode_u64(__be32 *bp, u64 n)
+{
+ struct yfs_xdr_u64 *x = (void *)bp;
+
+ *x = u64_to_xdr(n);
+ return bp + xdr_size(x);
+}
+
+static __be32 *xdr_encode_YFSFid(__be32 *bp, struct afs_fid *fid)
+{
+ struct yfs_xdr_YFSFid *x = (void *)bp;
+
+ x->volume = u64_to_xdr(fid->vid);
+ x->vnode.lo = u64_to_xdr(fid->vnode);
+ x->vnode.hi = htonl(fid->vnode_hi);
+ x->vnode.unique = htonl(fid->unique);
+ return bp + xdr_size(x);
+}
+
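+/*
+ * Work out the on-wire size of an XDR string: a 32-bit length word followed
+ * by the bytes, padded up to a 32-bit boundary.  A 5-byte name, for example,
+ * occupies 4 + 8 = 12 bytes.
+ */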
+static size_t xdr_strlen(unsigned int len)
+{
+ return sizeof(__be32) + round_up(len, sizeof(__be32));
+}
+
+static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned int len)
+{
+ bp = xdr_encode_u32(bp, len);
+ bp = memcpy(bp, p, len);
+ if (len & 3) {
+ unsigned int pad = 4 - (len & 3);
+
+ memset((u8 *)bp + len, 0, pad);
+ len += pad;
+ }
+
+ return bp + len / sizeof(__be32);
+}
+
+static s64 linux_to_yfs_time(const struct timespec64 *t)
+{
+ /* Convert to 100ns intervals. */
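+	/* For example, 1s + 250ns becomes 10,000,002 intervals; any remainder
+	 * below 100ns is truncated.
+	 */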
+ return (u64)t->tv_sec * 10000000 + t->tv_nsec/100;
+}
+
+static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode)
+{
+ struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+
+ x->mask = htonl(AFS_SET_MODE);
+ x->mode = htonl(mode & S_IALLUGO);
+ x->mtime_client = u64_to_xdr(0);
+ x->owner = u64_to_xdr(0);
+ x->group = u64_to_xdr(0);
+ return bp + xdr_size(x);
+}
+
+static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t)
+{
+ struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+ s64 mtime = linux_to_yfs_time(t);
+
+ x->mask = htonl(AFS_SET_MTIME);
+ x->mode = htonl(0);
+ x->mtime_client = u64_to_xdr(mtime);
+ x->owner = u64_to_xdr(0);
+ x->group = u64_to_xdr(0);
+ return bp + xdr_size(x);
+}
+
+/*
+ * Convert a signed 100ns-resolution 64-bit time into a timespec.
+ */
+static struct timespec64 yfs_time_to_linux(s64 t)
+{
+ struct timespec64 ts;
+ u64 abs_t;
+
+ /*
+	 * Unfortunately, we cannot use normal 64-bit division on a 32-bit
+	 * arch, and the alternative, do_div(), does not work with negative
+	 * numbers, so we have to special-case them.
+ */
+ if (t < 0) {
+ abs_t = -t;
+ ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100);
+ ts.tv_nsec = -ts.tv_nsec;
+ ts.tv_sec = -abs_t;
+ } else {
+ abs_t = t;
+ ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100;
+ ts.tv_sec = abs_t;
+ }
+
+ return ts;
+}
+
+static struct timespec64 xdr_to_time(const struct yfs_xdr_u64 xdr)
+{
+ s64 t = xdr_to_u64(xdr);
+
+ return yfs_time_to_linux(t);
+}
+
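+/*
+ * Check that marshalling a request exactly filled the buffer that was
+ * allocated for it, complaining if the encoded length over- or undershoots
+ * call->request_size.
+ */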
+static void yfs_check_req(struct afs_call *call, __be32 *bp)
+{
+ size_t len = (void *)bp - call->request;
+
+ if (len > call->request_size)
+ pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n",
+ call->type->name, len, call->request_size);
+ else if (len < call->request_size)
+ pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n",
+ call->type->name, len, call->request_size);
+}
+
+/*
+ * Dump a bad file status record.
+ */
+static void xdr_dump_bad(const __be32 *bp)
+{
+ __be32 x[4];
+ int i;
+
+ pr_notice("YFS XDR: Bad status record\n");
+ for (i = 0; i < 5 * 4 * 4; i += 16) {
+ memcpy(x, bp, 16);
+ bp += 4;
+ pr_notice("%03x: %08x %08x %08x %08x\n",
+ i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
+ }
+
+ memcpy(x, bp, 4);
+ pr_notice("0x50: %08x\n", ntohl(x[0]));
+}
+
+/*
+ * Decode a YFSFetchStatus block
+ */
+static int xdr_decode_YFSFetchStatus(struct afs_call *call,
+ const __be32 **_bp,
+ struct afs_file_status *status,
+ struct afs_vnode *vnode,
+ const afs_dataversion_t *expected_version,
+ struct afs_read *read_req)
+{
+ const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
+ u32 type;
+ u8 flags = 0;
+
+ status->abort_code = ntohl(xdr->abort_code);
+ if (status->abort_code != 0) {
+ if (vnode && status->abort_code == VNOVNODE) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ status->nlink = 0;
+ __afs_break_callback(vnode);
+ }
+ return 0;
+ }
+
+ type = ntohl(xdr->type);
+ switch (type) {
+ case AFS_FTYPE_FILE:
+ case AFS_FTYPE_DIR:
+ case AFS_FTYPE_SYMLINK:
+ if (type != status->type &&
+ vnode &&
+ !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+ pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ status->type, type);
+ goto bad;
+ }
+ status->type = type;
+ break;
+ default:
+ goto bad;
+ }
+
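+/* Copy a field out of the XDR-encoded status record into the local status,
+ * noting in the flags mask whether vnode metadata (the M4/M8 variants) or
+ * file data (D8) changed in the process.
+ */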
+#define EXTRACT_M4(FIELD) \
+ do { \
+ u32 x = ntohl(xdr->FIELD); \
+ if (status->FIELD != x) { \
+ flags |= AFS_VNODE_META_CHANGED; \
+ status->FIELD = x; \
+ } \
+ } while (0)
+
+#define EXTRACT_M8(FIELD) \
+ do { \
+ u64 x = xdr_to_u64(xdr->FIELD); \
+ if (status->FIELD != x) { \
+ flags |= AFS_VNODE_META_CHANGED; \
+ status->FIELD = x; \
+ } \
+ } while (0)
+
+#define EXTRACT_D8(FIELD) \
+ do { \
+ u64 x = xdr_to_u64(xdr->FIELD); \
+ if (status->FIELD != x) { \
+ flags |= AFS_VNODE_DATA_CHANGED; \
+ status->FIELD = x; \
+ } \
+ } while (0)
+
+ EXTRACT_M4(nlink);
+ EXTRACT_D8(size);
+ EXTRACT_D8(data_version);
+ EXTRACT_M8(author);
+ EXTRACT_M8(owner);
+ EXTRACT_M8(group);
+ EXTRACT_M4(mode);
+ EXTRACT_M4(caller_access); /* call ticket dependent */
+ EXTRACT_M4(anon_access);
+
+ status->mtime_client = xdr_to_time(xdr->mtime_client);
+ status->mtime_server = xdr_to_time(xdr->mtime_server);
+ status->lock_count = ntohl(xdr->lock_count);
+
+ if (read_req) {
+ read_req->data_version = status->data_version;
+ read_req->file_size = status->size;
+ }
+
+ *_bp += xdr_size(xdr);
+
+ if (vnode) {
+ if (test_bit(AFS_VNODE_UNSET, &vnode->flags))
+ flags |= AFS_VNODE_NOT_YET_SET;
+ afs_update_inode_from_status(vnode, status, expected_version,
+ flags);
+ }
+
+ return 0;
+
+bad:
+ xdr_dump_bad(*_bp);
+ return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+}
+
+/*
+ * Decode the file status. We need to lock the target vnode if we're going to
+ * update its status so that stat() sees the attributes update atomically.
+ */
+static int yfs_decode_status(struct afs_call *call,
+ const __be32 **_bp,
+ struct afs_file_status *status,
+ struct afs_vnode *vnode,
+ const afs_dataversion_t *expected_version,
+ struct afs_read *read_req)
+{
+ int ret;
+
+ if (!vnode)
+ return xdr_decode_YFSFetchStatus(call, _bp, status, vnode,
+ expected_version, read_req);
+
+ write_seqlock(&vnode->cb_lock);
+ ret = xdr_decode_YFSFetchStatus(call, _bp, status, vnode,
+ expected_version, read_req);
+ write_sequnlock(&vnode->cb_lock);
+ return ret;
+}
+
+/*
+ * Decode a YFSCallBack block
+ */
+static void xdr_decode_YFSCallBack(struct afs_call *call,
+ struct afs_vnode *vnode,
+ const __be32 **_bp)
+{
+ struct yfs_xdr_YFSCallBack *xdr = (void *)*_bp;
+ struct afs_cb_interest *old, *cbi = call->cbi;
+ u64 cb_expiry;
+
+ write_seqlock(&vnode->cb_lock);
+
+ if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) {
+ cb_expiry = xdr_to_u64(xdr->expiration_time);
+ do_div(cb_expiry, 10 * 1000 * 1000);
+ vnode->cb_version = ntohl(xdr->version);
+ vnode->cb_type = ntohl(xdr->type);
+ vnode->cb_expires_at = cb_expiry + ktime_get_real_seconds();
+ old = vnode->cb_interest;
+ if (old != call->cbi) {
+ vnode->cb_interest = cbi;
+ cbi = old;
+ }
+ set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ }
+
+ write_sequnlock(&vnode->cb_lock);
+ call->cbi = cbi;
+ *_bp += xdr_size(xdr);
+}
+
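+/*
+ * Decode a YFSCallBack block without attaching it to a vnode.  The expiry
+ * time arrives in 100ns ticks and is divided down to seconds before being
+ * rebased on the local clock.
+ */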
+static void xdr_decode_YFSCallBack_raw(const __be32 **_bp,
+ struct afs_callback *cb)
+{
+ struct yfs_xdr_YFSCallBack *x = (void *)*_bp;
+ u64 cb_expiry;
+
+ cb_expiry = xdr_to_u64(x->expiration_time);
+ do_div(cb_expiry, 10 * 1000 * 1000);
+ cb->version = ntohl(x->version);
+ cb->type = ntohl(x->type);
+ cb->expires_at = cb_expiry + ktime_get_real_seconds();
+
+ *_bp += xdr_size(x);
+}
+
+/*
+ * Decode a YFSVolSync block
+ */
+static void xdr_decode_YFSVolSync(const __be32 **_bp,
+ struct afs_volsync *volsync)
+{
+ struct yfs_xdr_YFSVolSync *x = (void *)*_bp;
+ u64 creation;
+
+ if (volsync) {
+ creation = xdr_to_u64(x->vol_creation_date);
+ do_div(creation, 10 * 1000 * 1000);
+ volsync->creation = creation;
+ }
+
+ *_bp += xdr_size(x);
+}
+
+/*
+ * Encode the requested attributes into a YFSStoreStatus block
+ */
+static __be32 *xdr_encode_YFS_StoreStatus(__be32 *bp, struct iattr *attr)
+{
+ struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+ s64 mtime = 0, owner = 0, group = 0;
+ u32 mask = 0, mode = 0;
+
+ if (attr->ia_valid & ATTR_MTIME) {
+ mask |= AFS_SET_MTIME;
+ mtime = linux_to_yfs_time(&attr->ia_mtime);
+ }
+
+ if (attr->ia_valid & ATTR_UID) {
+ mask |= AFS_SET_OWNER;
+ owner = from_kuid(&init_user_ns, attr->ia_uid);
+ }
+
+ if (attr->ia_valid & ATTR_GID) {
+ mask |= AFS_SET_GROUP;
+ group = from_kgid(&init_user_ns, attr->ia_gid);
+ }
+
+ if (attr->ia_valid & ATTR_MODE) {
+ mask |= AFS_SET_MODE;
+ mode = attr->ia_mode & S_IALLUGO;
+ }
+
+ x->mask = htonl(mask);
+ x->mode = htonl(mode);
+ x->mtime_client = u64_to_xdr(mtime);
+ x->owner = u64_to_xdr(owner);
+ x->group = u64_to_xdr(group);
+ return bp + xdr_size(x);
+}
+
+/*
+ * Decode a YFSFetchVolumeStatus block.
+ */
+static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
+ struct afs_volume_status *vs)
+{
+ const struct yfs_xdr_YFSFetchVolumeStatus *x = (const void *)*_bp;
+ u32 flags;
+
+ vs->vid = xdr_to_u64(x->vid);
+ vs->parent_id = xdr_to_u64(x->parent_id);
+ flags = ntohl(x->flags);
+ vs->online = flags & yfs_FVSOnline;
+ vs->in_service = flags & yfs_FVSInservice;
+ vs->blessed = flags & yfs_FVSBlessed;
+ vs->needs_salvage = flags & yfs_FVSNeedsSalvage;
+ vs->type = ntohl(x->type);
+ vs->min_quota = 0;
+ vs->max_quota = xdr_to_u64(x->max_quota);
+ vs->blocks_in_use = xdr_to_u64(x->blocks_in_use);
+ vs->part_blocks_avail = xdr_to_u64(x->part_blocks_avail);
+ vs->part_max_blocks = xdr_to_u64(x->part_max_blocks);
+ vs->vol_copy_date = xdr_to_u64(x->vol_copy_date);
+ vs->vol_backup_date = xdr_to_u64(x->vol_backup_date);
+ *_bp += sizeof(*x) / sizeof(__be32);
+}
+
+/*
+ * Deliver reply data to a YFS.FetchStatus operation on a vnode.
+ */
+static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSCallBack(call, vnode, &bp);
+ xdr_decode_YFSVolSync(&bp, call->reply[1]);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.FetchStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
+ .name = "YFS.FetchStatus(vnode)",
+ .op = yfs_FS_FetchStatus,
+ .deliver = yfs_deliver_fs_fetch_status_vnode,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a file.
+ */
+int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync,
+ bool new_inode)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus_vnode,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->reply[1] = volsync;
+ call->expected_version = new_inode ? 1 : vnode->status.data_version;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ yfs_check_req(call, bp);
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.FetchData64 call.
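+ *
+ * The current unmarshalling phase is kept in call->unmarshall; the switch
+ * cases below fall through deliberately so that delivery can resume where
+ * the previous packet left off.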
+ */
+static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ struct afs_read *req = call->reply[2];
+ const __be32 *bp;
+ unsigned int size;
+ int ret;
+
+ _enter("{%u,%zu/%llu}",
+ call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+
+ switch (call->unmarshall) {
+ case 0:
+ req->actual_len = 0;
+ req->index = 0;
+ req->offset = req->pos & (PAGE_SIZE - 1);
+ afs_extract_to_tmp64(call);
+ call->unmarshall++;
+
+ /* extract the returned data length */
+ case 1:
+ _debug("extract data length");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ req->actual_len = be64_to_cpu(call->tmp64);
+ _debug("DATA length: %llu", req->actual_len);
+ req->remain = min(req->len, req->actual_len);
+ if (req->remain == 0)
+ goto no_more_data;
+
+ call->unmarshall++;
+
+ begin_page:
+ ASSERTCMP(req->index, <, req->nr_pages);
+ if (req->remain > PAGE_SIZE - req->offset)
+ size = PAGE_SIZE - req->offset;
+ else
+ size = req->remain;
+ call->bvec[0].bv_len = size;
+ call->bvec[0].bv_offset = req->offset;
+ call->bvec[0].bv_page = req->pages[req->index];
+ iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ ASSERTCMP(size, <=, PAGE_SIZE);
+
+ /* extract the returned data */
+ case 2:
+ _debug("extract data %zu/%llu",
+ iov_iter_count(&call->iter), req->remain);
+
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+ req->remain -= call->bvec[0].bv_len;
+ req->offset += call->bvec[0].bv_len;
+ ASSERTCMP(req->offset, <=, PAGE_SIZE);
+ if (req->offset == PAGE_SIZE) {
+ req->offset = 0;
+ if (req->page_done)
+ req->page_done(call, req);
+ req->index++;
+ if (req->remain > 0)
+ goto begin_page;
+ }
+
+ ASSERTCMP(req->remain, ==, 0);
+ if (req->actual_len <= req->len)
+ goto no_more_data;
+
+ /* Discard any excess data the server gave us */
+ iov_iter_discard(&call->iter, READ, req->actual_len - req->len);
+ call->unmarshall = 3;
+ case 3:
+ _debug("extract discard %zu/%llu",
+ iov_iter_count(&call->iter), req->actual_len - req->len);
+
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ no_more_data:
+ call->unmarshall = 4;
+ afs_extract_to_buf(call,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+
+ /* extract the metadata */
+ case 4:
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &vnode->status.data_version, req);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSCallBack(call, vnode, &bp);
+ xdr_decode_YFSVolSync(&bp, call->reply[1]);
+
+ call->unmarshall++;
+
+ case 5:
+ break;
+ }
+
+ for (; req->index < req->nr_pages; req->index++) {
+ if (req->offset < PAGE_SIZE)
+ zero_user_segment(req->pages[req->index],
+ req->offset, PAGE_SIZE);
+ if (req->page_done)
+ req->page_done(call, req);
+ req->offset = 0;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
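+/*
+ * Clean up after a data fetch, dropping the ref on the read request that was
+ * taken when the call was dispatched.
+ */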
+static void yfs_fetch_data_destructor(struct afs_call *call)
+{
+ struct afs_read *req = call->reply[2];
+
+ afs_put_read(req);
+ afs_flat_call_destructor(call);
+}
+
+/*
+ * YFS.FetchData64 operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchData64 = {
+ .name = "YFS.FetchData64",
+ .op = yfs_FS_FetchData64,
+ .deliver = yfs_deliver_fs_fetch_data64,
+ .destructor = yfs_fetch_data_destructor,
+};
+
+/*
+ * Fetch data from a file.
+ */
+int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},%llx,%llx",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode,
+ req->pos, req->len);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSFetchData64,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_u64) * 2,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->reply[1] = NULL; /* volsync */
+ call->reply[2] = req;
+ call->expected_version = vnode->status.data_version;
+ call->want_reply_time = true;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSFETCHDATA64);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_u64(bp, req->pos);
+ bp = xdr_encode_u64(bp, req->len);
+ yfs_check_req(call, bp);
+
+ refcount_inc(&req->usage);
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data for YFS.CreateFile or YFS.MakeDir.
+ */
+static int yfs_deliver_fs_create_vnode(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_YFSFid(&bp, call->reply[1]);
+ ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSCallBack_raw(&bp, call->reply[3]);
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.CreateFile operation type
+ */
+static const struct afs_call_type afs_RXFSCreateFile = {
+ .name = "YFS.CreateFile",
+ .op = yfs_FS_CreateFile,
+ .deliver = yfs_deliver_fs_create_vnode,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Create a file.
+ */
+int yfs_fs_create_file(struct afs_fs_cursor *fc,
+ const char *name,
+ umode_t mode,
+ u64 current_data_version,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus,
+ struct afs_callback *newcb)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ size_t namesz, reqsz, rplsz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ reqsz = (sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz) +
+ sizeof(struct yfs_xdr_YFSStoreStatus) +
+ sizeof(__be32));
+ rplsz = (sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+
+ call = afs_alloc_flat_call(net, &afs_RXFSCreateFile, reqsz, rplsz);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->reply[1] = newfid;
+ call->reply[2] = newstatus;
+ call->reply[3] = newcb;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSCREATEFILE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
+ bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
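+/*
+ * YFS.MakeDir operation type
+ */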
+static const struct afs_call_type yfs_RXFSMakeDir = {
+ .name = "YFS.MakeDir",
+ .op = yfs_FS_MakeDir,
+ .deliver = yfs_deliver_fs_create_vnode,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Make a directory.
+ */
+int yfs_fs_make_dir(struct afs_fs_cursor *fc,
+ const char *name,
+ umode_t mode,
+ u64 current_data_version,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus,
+ struct afs_callback *newcb)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ size_t namesz, reqsz, rplsz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ reqsz = (sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz) +
+ sizeof(struct yfs_xdr_YFSStoreStatus));
+ rplsz = (sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+
+ call = afs_alloc_flat_call(net, &yfs_RXFSMakeDir, reqsz, rplsz);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->reply[1] = newfid;
+ call->reply[2] = newstatus;
+ call->reply[3] = newcb;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSMAKEDIR);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.RemoveFile2 operation.
+ */
+static int yfs_deliver_fs_remove_file2(struct afs_call *call)
+{
+ struct afs_vnode *dvnode = call->reply[0];
+ struct afs_vnode *vnode = call->reply[1];
+ struct afs_fid fid;
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+
+ xdr_decode_YFSFid(&bp, &fid);
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ /* Was deleted if vnode->status.abort_code == VNOVNODE. */
+
+ xdr_decode_YFSVolSync(&bp, NULL);
+ return 0;
+}
+
+/*
+ * YFS.RemoveFile2 operation type.
+ */
+static const struct afs_call_type yfs_RXYFSRemoveFile2 = {
+ .name = "YFS.RemoveFile2",
+ .op = yfs_FS_RemoveFile2,
+ .deliver = yfs_deliver_fs_remove_file2,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Remove a file and retrieve new file status.
+ */
+int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+ const char *name, u64 current_data_version)
+{
+ struct afs_vnode *dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(dvnode);
+ size_t namesz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSRemoveFile2,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = dvnode;
+ call->reply[1] = vnode;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSREMOVEFILE2);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &dvnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.RemoveFile or YFS.RemoveDir operation.
+ */
+static int yfs_deliver_fs_remove(struct afs_call *call)
+{
+ struct afs_vnode *dvnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+
+ xdr_decode_YFSVolSync(&bp, NULL);
+ return 0;
+}
+
+/*
+ * YFS.RemoveDir and YFS.RemoveFile operation types.
+ */
+static const struct afs_call_type yfs_RXYFSRemoveFile = {
+ .name = "YFS.RemoveFile",
+ .op = yfs_FS_RemoveFile,
+ .deliver = yfs_deliver_fs_remove,
+ .destructor = afs_flat_call_destructor,
+};
+
+static const struct afs_call_type yfs_RXYFSRemoveDir = {
+ .name = "YFS.RemoveDir",
+ .op = yfs_FS_RemoveDir,
+ .deliver = yfs_deliver_fs_remove,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * remove a file or directory
+ */
+int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+ const char *name, bool isdir, u64 current_data_version)
+{
+ struct afs_vnode *dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(dvnode);
+ size_t namesz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ call = afs_alloc_flat_call(
+ net, isdir ? &yfs_RXYFSRemoveDir : &yfs_RXYFSRemoveFile,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = dvnode;
+ call->reply[1] = vnode;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, isdir ? YFSREMOVEDIR : YFSREMOVEFILE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &dvnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.Link operation.
+ */
+static int yfs_deliver_fs_link(struct afs_call *call)
+{
+ struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.Link operation type.
+ */
+static const struct afs_call_type yfs_RXYFSLink = {
+ .name = "YFS.Link",
+ .op = yfs_FS_Link,
+ .deliver = yfs_deliver_fs_link,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Make a hard link.
+ */
+int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+ const char *name, u64 current_data_version)
+{
+ struct afs_vnode *dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ size_t namesz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ call = afs_alloc_flat_call(net, &yfs_RXYFSLink,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz) +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = dvnode;
+ call->reply[1] = vnode;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSLINK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.Symlink operation.
+ */
+static int yfs_deliver_fs_symlink(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_YFSFid(&bp, call->reply[1]);
+ ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.Symlink operation type
+ */
+static const struct afs_call_type yfs_RXYFSSymlink = {
+ .name = "YFS.Symlink",
+ .op = yfs_FS_Symlink,
+ .deliver = yfs_deliver_fs_symlink,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Create a symbolic link.
+ */
+int yfs_fs_symlink(struct afs_fs_cursor *fc,
+ const char *name,
+ const char *contents,
+ u64 current_data_version,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus)
+{
+ struct afs_vnode *dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(dvnode);
+ size_t namesz, contents_sz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ contents_sz = strlen(contents);
+ call = afs_alloc_flat_call(net, &yfs_RXYFSSymlink,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz) +
+ xdr_strlen(contents_sz) +
+ sizeof(struct yfs_xdr_YFSStoreStatus),
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = dvnode;
+ call->reply[1] = newfid;
+ call->reply[2] = newstatus;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSYMLINK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ bp = xdr_encode_string(bp, contents, contents_sz);
+ bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &dvnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.Rename operation.
+ */
+static int yfs_deliver_fs_rename(struct afs_call *call)
+{
+ struct afs_vnode *orig_dvnode = call->reply[0];
+ struct afs_vnode *new_dvnode = call->reply[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ if (new_dvnode != orig_dvnode) {
+ ret = yfs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
+ &call->expected_version_2, NULL);
+ if (ret < 0)
+ return ret;
+ }
+
+ xdr_decode_YFSVolSync(&bp, NULL);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.Rename operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename = {
+ .name = "FS.Rename",
+ .op = yfs_FS_Rename,
+ .deliver = yfs_deliver_fs_rename,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory.
+ */
+int yfs_fs_rename(struct afs_fs_cursor *fc,
+ const char *orig_name,
+ struct afs_vnode *new_dvnode,
+ const char *new_name,
+ u64 current_orig_data_version,
+ u64 current_new_data_version)
+{
+ struct afs_vnode *orig_dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(orig_dvnode);
+ size_t o_namesz, n_namesz;
+ __be32 *bp;
+
+ _enter("");
+
+ o_namesz = strlen(orig_name);
+ n_namesz = strlen(new_name);
+ call = afs_alloc_flat_call(net, &yfs_RXYFSRename,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(o_namesz) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(n_namesz),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = orig_dvnode;
+ call->reply[1] = new_dvnode;
+ call->expected_version = current_orig_data_version + 1;
+ call->expected_version_2 = current_new_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvnode->fid);
+ bp = xdr_encode_string(bp, orig_name, o_namesz);
+ bp = xdr_encode_YFSFid(bp, &new_dvnode->fid);
+ bp = xdr_encode_string(bp, new_name, n_namesz);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &orig_dvnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.StoreData64 operation.
+ */
+static int yfs_deliver_fs_store_data(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("");
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ afs_pages_written_back(vnode, call);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.StoreData64 operation type.
+ */
+static const struct afs_call_type yfs_RXYFSStoreData64 = {
+ .name = "YFS.StoreData64",
+ .op = yfs_FS_StoreData64,
+ .deliver = yfs_deliver_fs_store_data,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Store a set of pages to a large file.
+ */
+int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
+ pgoff_t first, pgoff_t last,
+ unsigned offset, unsigned to)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ loff_t size, pos, i_size;
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ size = (loff_t)to - (loff_t)offset;
+ if (first != last)
+ size += (loff_t)(last - first) << PAGE_SHIFT;
+ pos = (loff_t)first << PAGE_SHIFT;
+ pos += offset;
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (pos + size > i_size)
+ i_size = size + pos;
+
+ _debug("size %llx, at %llx, i_size %llx",
+ (unsigned long long)size, (unsigned long long)pos,
+ (unsigned long long)i_size);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64,
+ sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSStoreStatus) +
+ sizeof(struct yfs_xdr_u64) * 3,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->mapping = mapping;
+ call->reply[0] = vnode;
+ call->first = first;
+ call->last = last;
+ call->first_offset = offset;
+ call->last_to = to;
+ call->send_pages = true;
+ call->expected_version = vnode->status.data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSTOREDATA64);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_YFSStoreStatus_mtime(bp, &vnode->vfs_inode.i_mtime);
+ bp = xdr_encode_u64(bp, pos);
+ bp = xdr_encode_u64(bp, size);
+ bp = xdr_encode_u64(bp, i_size);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.StoreStatus operation (also used when a
+ * YFS.StoreData64 is issued purely to change the status).
+ */
+static int yfs_deliver_fs_store_status(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("");
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.StoreStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSStoreStatus = {
+ .name = "YFS.StoreStatus",
+ .op = yfs_FS_StoreStatus,
+ .deliver = yfs_deliver_fs_store_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = {
+ .name = "YFS.StoreData64",
+ .op = yfs_FS_StoreData64,
+ .deliver = yfs_deliver_fs_store_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Set the attributes on a file, using YFS.StoreData64 rather than
+ * YFS.StoreStatus so that the file size can be changed as well.
+ */
+static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64_as_Status,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSStoreStatus) +
+ sizeof(struct yfs_xdr_u64) * 3,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->expected_version = vnode->status.data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSTOREDATA64);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_YFS_StoreStatus(bp, attr);
+ bp = xdr_encode_u64(bp, 0); /* position of start of write */
+ bp = xdr_encode_u64(bp, 0); /* size of write */
+ bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Set the attributes on a file, using YFS.StoreData64 if there's a change in
+ * file size, and YFS.StoreStatus otherwise.
+ */
+int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ if (attr->ia_valid & ATTR_SIZE)
+ return yfs_fs_setattr_size(fc, attr);
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSStoreStatus),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->expected_version = vnode->status.data_version;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSTORESTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_YFS_StoreStatus(bp, attr);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.GetVolumeStatus operation.
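+ *
+ * The reply carries a status record followed by three XDR strings (volume
+ * name, offline message and MOTD); each is extracted into the scratch buffer
+ * at call->reply[2] and NUL-terminated in turn.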
+ */
+static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
+{
+ const __be32 *bp;
+ char *p;
+ u32 size;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ call->unmarshall++;
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
+
+ /* extract the returned status record */
+ case 1:
+ _debug("extract status");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ xdr_decode_YFSFetchVolumeStatus(&bp, call->reply[1]);
+ call->unmarshall++;
+ afs_extract_to_tmp(call);
+
+ /* extract the volume name length */
+ case 2:
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("volname length: %u", call->count);
+ if (call->count >= AFSNAMEMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_volname_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
+ call->unmarshall++;
+
+ /* extract the volume name */
+ case 3:
+ _debug("extract volname");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ p = call->reply[2];
+ p[call->count] = 0;
+ _debug("volname '%s'", p);
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* extract the offline message length */
+ case 4:
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("offline msg length: %u", call->count);
+ if (call->count >= AFSNAMEMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_offline_msg_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
+ call->unmarshall++;
+
+ /* extract the offline message */
+ case 5:
+ _debug("extract offline");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ p = call->reply[2];
+ p[call->count] = 0;
+ _debug("offline '%s'", p);
+
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* extract the message of the day length */
+ case 6:
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("motd length: %u", call->count);
+ if (call->count >= AFSNAMEMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_motd_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
+ call->unmarshall++;
+
+ /* extract the message of the day */
+ case 7:
+ _debug("extract motd");
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ p = call->reply[2];
+ p[call->count] = 0;
+ _debug("motd '%s'", p);
+
+ call->unmarshall++;
+
+ case 8:
+ break;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Destroy a YFS.GetVolumeStatus call.
+ */
+static void yfs_get_volume_status_call_destructor(struct afs_call *call)
+{
+ kfree(call->reply[2]);
+ call->reply[2] = NULL;
+ afs_flat_call_destructor(call);
+}
+
+/*
+ * YFS.GetVolumeStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSGetVolumeStatus = {
+ .name = "YFS.GetVolumeStatus",
+ .op = yfs_FS_GetVolumeStatus,
+ .deliver = yfs_deliver_fs_get_volume_status,
+ .destructor = yfs_get_volume_status_call_destructor,
+};
+
+/*
+ * fetch the status of a volume
+ */
+int yfs_fs_get_volume_status(struct afs_fs_cursor *fc,
+ struct afs_volume_status *vs)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+ void *tmpbuf;
+
+ _enter("");
+
+ tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
+ if (!tmpbuf)
+ return -ENOMEM;
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_u64),
+ sizeof(struct yfs_xdr_YFSFetchVolumeStatus) +
+ sizeof(__be32));
+ if (!call) {
+ kfree(tmpbuf);
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->reply[1] = vs;
+ call->reply[2] = tmpbuf;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSGETVOLUMESTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_u64(bp, vnode->fid.vid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.SetLock, YFS.ExtendLock or YFS.ReleaseLock
+ * operation.
+ */
+static int yfs_deliver_fs_xxxx_lock(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.SetLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSSetLock = {
+ .name = "YFS.SetLock",
+ .op = yfs_FS_SetLock,
+ .deliver = yfs_deliver_fs_xxxx_lock,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.ExtendLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSExtendLock = {
+ .name = "YFS.ExtendLock",
+ .op = yfs_FS_ExtendLock,
+ .deliver = yfs_deliver_fs_xxxx_lock,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.ReleaseLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSReleaseLock = {
+ .name = "YFS.ReleaseLock",
+ .op = yfs_FS_ReleaseLock,
+ .deliver = yfs_deliver_fs_xxxx_lock,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Set a lock on a file
+ */
+int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSSetLock,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(__be32),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSETLOCK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_u32(bp, type);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * extend a lock on a file
+ */
+int yfs_fs_extend_lock(struct afs_fs_cursor *fc)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSExtendLock,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSEXTENDLOCK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * release a lock on a file
+ */
+int yfs_fs_release_lock(struct afs_fs_cursor *fc)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSReleaseLock,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRELEASELOCK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.FetchStatus call with no vnode.
+ */
+static int yfs_deliver_fs_fetch_status(struct afs_call *call)
+{
+ struct afs_file_status *status = call->reply[1];
+ struct afs_callback *callback = call->reply[2];
+ struct afs_volsync *volsync = call->reply[3];
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSCallBack_raw(&bp, callback);
+ xdr_decode_YFSVolSync(&bp, volsync);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.FetchStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchStatus = {
+ .name = "YFS.FetchStatus",
+ .op = yfs_FS_FetchStatus,
+ .deliver = yfs_deliver_fs_fetch_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a fid without needing a vnode handle.
+ */
+int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
+ struct afs_net *net,
+ struct afs_fid *fid,
+ struct afs_file_status *status,
+ struct afs_callback *callback,
+ struct afs_volsync *volsync)
+{
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), fid->vid, fid->vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[1] = status;
+ call->reply[2] = callback;
+ call->reply[3] = volsync;
+ call->expected_version = 1; /* vnode->status.data_version */
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, fid);
+ yfs_check_req(call, bp);
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.InlineBulkStatus call
+ */
+static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
+{
+ struct afs_file_status *statuses;
+ struct afs_callback *callbacks;
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ u32 tmp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* Extract the file status count and array in two steps */
+ case 1:
+ _debug("extract status count");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ tmp = ntohl(call->tmp);
+ _debug("status count: %u/%u", tmp, call->count2);
+ if (tmp != call->count2)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_count);
+
+ call->count = 0;
+ call->unmarshall++;
+ more_counts:
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
+
+ case 2:
+ _debug("extract status array %u", call->count);
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ statuses = call->reply[1];
+ ret = yfs_decode_status(call, &bp, &statuses[call->count],
+ call->count == 0 ? vnode : NULL,
+ NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ call->count++;
+ if (call->count < call->count2)
+ goto more_counts;
+
+ call->count = 0;
+ call->unmarshall++;
+ afs_extract_to_tmp(call);
+
+ /* Extract the callback count and array in two steps */
+ case 3:
+ _debug("extract CB count");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ tmp = ntohl(call->tmp);
+ _debug("CB count: %u", tmp);
+ if (tmp != call->count2)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_cb_count);
+ call->count = 0;
+ call->unmarshall++;
+ more_cbs:
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
+
+ case 4:
+ _debug("extract CB array");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ _debug("unmarshall CB array");
+ bp = call->buffer;
+ callbacks = call->reply[2];
+ xdr_decode_YFSCallBack_raw(&bp, &callbacks[call->count]);
+ statuses = call->reply[1];
+ if (call->count == 0 && vnode && statuses[0].abort_code == 0) {
+ bp = call->buffer;
+ xdr_decode_YFSCallBack(call, vnode, &bp);
+ }
+ call->count++;
+ if (call->count < call->count2)
+ goto more_cbs;
+
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
+ call->unmarshall++;
+
+ case 5:
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ xdr_decode_YFSVolSync(&bp, call->reply[3]);
+
+ call->unmarshall++;
+
+ case 6:
+ break;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
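
The deliver function above is re-entered every time more reply data arrives: call->unmarshall records how far decoding has progressed, and each case deliberately falls through once its data is complete. A self-contained sketch of that resumable-parse pattern, with extract() standing in for afs_extract_data():

	#include <stdio.h>

	struct parser {
		int state;	/* like call->unmarshall */
		int consumed;	/* bytes of the stream already decoded */
		int avail;	/* bytes received so far (grows over time) */
	};

	/* Stand-in for afs_extract_data(): fail until `need` more bytes are in. */
	static int extract(struct parser *p, int need)
	{
		if (p->avail - p->consumed < need)
			return -1;	/* not enough yet; caller retries later */
		p->consumed += need;
		return 0;
	}

	static int deliver(struct parser *p)
	{
		switch (p->state) {
		case 0:
			p->state++;
			/* Fall through */
		case 1:			/* wait for a 4-byte header */
			if (extract(p, 4) < 0)
				return -1;
			p->state++;
			/* Fall through */
		case 2:			/* then an 8-byte body */
			if (extract(p, 8) < 0)
				return -1;
			p->state++;
			/* Fall through */
		case 3:
			break;
		}
		return 0;
	}

	int main(void)
	{
		struct parser p = { 0 };

		for (p.avail = 0; p.avail <= 12; p.avail += 3) {
			int ret = deliver(&p);
			printf("avail=%d -> %s\n", p.avail, ret ? "again" : "done");
			if (!ret)
				break;
		}
		return 0;
	}
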
+
+/*
+ * YFS.InlineBulkStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSInlineBulkStatus = {
+ .name = "YFS.InlineBulkStatus",
+ .op = yfs_FS_InlineBulkStatus,
+ .deliver = yfs_deliver_fs_inline_bulk_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for up to 1024 files
+ */
+int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
+ struct afs_net *net,
+ struct afs_fid *fids,
+ struct afs_file_status *statuses,
+ struct afs_callback *callbacks,
+ unsigned int nr_fids,
+ struct afs_volsync *volsync)
+{
+ struct afs_call *call;
+ __be32 *bp;
+ int i;
+
+ _enter(",%x,{%llx:%llu},%u",
+ key_serial(fc->key), fids[0].vid, fids[0].vnode, nr_fids);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSInlineBulkStatus,
+ sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_YFSFid) * nr_fids,
+ sizeof(struct yfs_xdr_YFSFetchStatus));
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[1] = statuses;
+ call->reply[2] = callbacks;
+ call->reply[3] = volsync;
+ call->count2 = nr_fids;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_u32(bp, nr_fids);
+ for (i = 0; i < nr_fids; i++)
+ bp = xdr_encode_YFSFid(bp, &fids[i]);
+ yfs_check_req(call, bp);
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &fids[0]);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..38b741aef0bf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -45,6 +45,7 @@
#include <asm/kmap_types.h>
#include <linux/uaccess.h>
+#include <linux/nospec.h>
#include "internal.h"
@@ -69,6 +70,12 @@ struct aio_ring {
struct io_event io_events[0];
}; /* 128 bytes + ring size */
+/*
+ * Plugging is meant to work with larger batches of IOs. If we don't
+ * have more than the threshold below, don't bother setting up a plug.
+ */
+#define AIO_PLUG_THRESHOLD 2
+
#define AIO_RING_PAGES 8
struct kioctx_table {
@@ -160,9 +167,13 @@ struct kioctx {
unsigned id;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct fsync_iocb {
- struct work_struct work;
struct file *file;
+ struct work_struct work;
bool datasync;
};
@@ -176,8 +187,15 @@ struct poll_iocb {
struct work_struct work;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct aio_kiocb {
union {
+ struct file *ki_filp;
struct kiocb rw;
struct fsync_iocb fsync;
struct poll_iocb poll;
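
The layout rule in the comment above is what lets the completion path reach the file pointer through bare ki_filp regardless of which union member was used. A standalone illustration of that common-initial-member aliasing (struct file is kept opaque here and the address is a dummy value):

	#include <assert.h>
	#include <stdio.h>

	struct file;	/* opaque, for illustration only */

	struct rw_op    { struct file *file; long pos; };
	struct fsync_op { struct file *file; int datasync; };

	struct kiocb_like {
		union {		/* every member starts with the file pointer */
			struct file *ki_filp;
			struct rw_op rw;
			struct fsync_op fsync;
		};
	};

	int main(void)
	{
		struct kiocb_like req;

		req.rw.file = (struct file *)0x1000;	/* set via one member... */
		assert(req.ki_filp == req.fsync.file);	/* ...visible via all */
		printf("ki_filp=%p\n", (void *)req.ki_filp);
		return 0;
	}
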
@@ -408,7 +426,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
BUG_ON(PageWriteback(old));
get_page(new);
- rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
+ rc = migrate_page_move_mapping(mapping, new, old, mode, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
put_page(new);
goto out_unlock;
@@ -901,7 +919,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
local_irq_restore(flags);
}
-static bool get_reqs_available(struct kioctx *ctx)
+static bool __get_reqs_available(struct kioctx *ctx)
{
struct kioctx_cpu *kcpu;
bool ret = false;
@@ -993,6 +1011,14 @@ static void user_refill_reqs_available(struct kioctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
+static bool get_reqs_available(struct kioctx *ctx)
+{
+ if (__get_reqs_available(ctx))
+ return true;
+ user_refill_reqs_available(ctx);
+ return __get_reqs_available(ctx);
+}
+
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
@@ -1001,24 +1027,16 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
struct aio_kiocb *req;
- if (!get_reqs_available(ctx)) {
- user_refill_reqs_available(ctx);
- if (!get_reqs_available(ctx))
- return NULL;
- }
-
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
if (unlikely(!req))
- goto out_put;
+ return NULL;
percpu_ref_get(&ctx->reqs);
+ req->ki_ctx = ctx;
INIT_LIST_HEAD(&req->ki_list);
refcount_set(&req->ki_refcnt, 0);
- req->ki_ctx = ctx;
+ req->ki_eventfd = NULL;
return req;
-out_put:
- put_reqs_available(ctx, 1);
- return NULL;
}
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
@@ -1038,6 +1056,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
if (!table || id >= table->nr)
goto out;
+ id = array_index_nospec(id, table->nr);
ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
if (percpu_ref_tryget_live(&ctx->users))
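
The array_index_nospec() clamp added above keeps a speculatively executed load from using an out-of-bounds id even after the bounds check. A userspace sketch modelled on the kernel's generic array_index_mask_nospec() — the arithmetic right shift of a negative value is assumed to sign-extend, as the kernel assumes:

	#include <stdio.h>

	/* All-ones when idx < sz, all-zeroes otherwise, with no branch. */
	static unsigned long mask_nospec(unsigned long idx, unsigned long sz)
	{
		/* top bit of (idx | (sz - 1 - idx)) is set iff idx >= sz */
		return ~(long)(idx | (sz - 1 - idx)) >> (sizeof(long) * 8 - 1);
	}

	int main(void)
	{
		unsigned long table_nr = 8;

		for (unsigned long id = 6; id < 10; id++) {
			unsigned long safe = id & mask_nospec(id, table_nr);
			printf("id=%lu -> clamped=%lu\n", id, safe);
		}
		return 0;	/* prints 6, 7, then 0, 0 */
	}
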
@@ -1052,11 +1071,22 @@ static inline void iocb_put(struct aio_kiocb *iocb)
{
if (refcount_read(&iocb->ki_refcnt) == 0 ||
refcount_dec_and_test(&iocb->ki_refcnt)) {
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
percpu_ref_put(&iocb->ki_ctx->reqs);
kmem_cache_free(kiocb_cachep, iocb);
}
}
+static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
+ long res, long res2)
+{
+ ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+ ev->data = iocb->ki_user_data;
+ ev->res = res;
+ ev->res2 = res2;
+}
+
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
@@ -1084,10 +1114,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
- event->data = iocb->ki_user_data;
- event->res = res;
- event->res2 = res2;
+ aio_fill_event(event, iocb, res, res2);
kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
@@ -1410,18 +1437,15 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- fput(kiocb->ki_filp);
aio_complete(iocb, res, res2);
}
-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
int ret;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp))
- return -EBADF;
req->ki_complete = aio_complete_rw;
+ req->private = NULL;
req->ki_pos = iocb->aio_offset;
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
@@ -1441,15 +1465,17 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
req->ki_ioprio = iocb->aio_reqprio;
} else
- req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+ req->ki_ioprio = get_current_ioprio();
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
- fput(req->ki_filp);
- return ret;
+ return ret;
+
+ req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
+ return 0;
}
-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
+static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
{
void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
@@ -1484,12 +1510,12 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
ret = -EINTR;
/*FALLTHRU*/
default:
- aio_complete_rw(req, ret, 0);
+ req->ki_complete(req, ret, 0);
}
}
-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
- bool compat)
+static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+ bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1500,29 +1526,24 @@ static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
if (ret)
return ret;
file = req->ki_filp;
-
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
+ return -EBADF;
ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
- bool compat)
+static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+ bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1534,16 +1555,14 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
return ret;
file = req->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
/*
@@ -1561,9 +1580,6 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1573,23 +1589,18 @@ static void aio_fsync_work(struct work_struct *work)
int ret;
ret = vfs_fsync(req->file, req->datasync);
- fput(req->file);
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}
-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+ bool datasync)
{
if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
iocb->aio_rw_flags))
return -EINVAL;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
- if (unlikely(!req->file->f_op->fsync)) {
- fput(req->file);
+ if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
- }
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
@@ -1599,10 +1610,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
{
- struct file *file = iocb->poll.file;
-
aio_complete(iocb, mangle_poll(mask), 0);
- fput(file);
}
static void aio_poll_complete_work(struct work_struct *work)
@@ -1658,6 +1666,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
+ unsigned long flags;
req->woken = true;
@@ -1666,10 +1675,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (!(mask & req->events))
return 0;
- /* try to complete the iocb inline if we can: */
- if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+ * call this function with IRQs disabled and because IRQs
+ * have to be disabled before ctx_lock is obtained.
+ */
+ if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
list_del(&iocb->ki_list);
- spin_unlock(&iocb->ki_ctx->ctx_lock);
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
list_del_init(&req->wait.entry);
aio_poll_complete(iocb, mask);
@@ -1705,7 +1719,7 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
add_wait_queue(head, &pt->iocb->poll.wait);
}
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
{
struct kioctx *ctx = aiocb->ki_ctx;
struct poll_iocb *req = &aiocb->poll;
@@ -1721,9 +1735,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
INIT_WORK(&req->work, aio_poll_complete_work);
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
+
+ req->head = NULL;
+ req->woken = false;
+ req->cancelled = false;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
@@ -1762,10 +1777,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
spin_unlock_irq(&ctx->ctx_lock);
out:
- if (unlikely(apt.error)) {
- fput(req->file);
+ if (unlikely(apt.error))
return apt.error;
- }
if (mask)
aio_poll_complete(aiocb, mask);
@@ -1773,44 +1786,49 @@ out:
return 0;
}
-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- bool compat)
+static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
+ struct iocb __user *user_iocb, bool compat)
{
struct aio_kiocb *req;
- struct iocb iocb;
ssize_t ret;
- if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
- return -EFAULT;
-
/* enforce forwards compatibility on users */
- if (unlikely(iocb.aio_reserved2)) {
+ if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
/* prevent overflows */
if (unlikely(
- (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
- (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
- ((ssize_t)iocb.aio_nbytes < 0)
+ (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+ (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+ ((ssize_t)iocb->aio_nbytes < 0)
)) {
pr_debug("EINVAL: overflow check\n");
return -EINVAL;
}
+ if (!get_reqs_available(ctx))
+ return -EAGAIN;
+
+ ret = -EAGAIN;
req = aio_get_req(ctx);
if (unlikely(!req))
- return -EAGAIN;
+ goto out_put_reqs_available;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ ret = -EBADF;
+ if (unlikely(!req->ki_filp))
+ goto out_put_req;
- if (iocb.aio_flags & IOCB_FLAG_RESFD) {
+ if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
* instance of the file* now. The file descriptor must be
* an eventfd() fd, and will be signaled for each completed
* event using the eventfd_signal() function.
*/
- req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
+ req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
if (IS_ERR(req->ki_eventfd)) {
ret = PTR_ERR(req->ki_eventfd);
req->ki_eventfd = NULL;
@@ -1825,32 +1843,32 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
req->ki_user_iocb = user_iocb;
- req->ki_user_data = iocb.aio_data;
+ req->ki_user_data = iocb->aio_data;
- switch (iocb.aio_lio_opcode) {
+ switch (iocb->aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->rw, &iocb, false, compat);
+ ret = aio_read(&req->rw, iocb, false, compat);
break;
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->rw, &iocb, false, compat);
+ ret = aio_write(&req->rw, iocb, false, compat);
break;
case IOCB_CMD_PREADV:
- ret = aio_read(&req->rw, &iocb, true, compat);
+ ret = aio_read(&req->rw, iocb, true, compat);
break;
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->rw, &iocb, true, compat);
+ ret = aio_write(&req->rw, iocb, true, compat);
break;
case IOCB_CMD_FSYNC:
- ret = aio_fsync(&req->fsync, &iocb, false);
+ ret = aio_fsync(&req->fsync, iocb, false);
break;
case IOCB_CMD_FDSYNC:
- ret = aio_fsync(&req->fsync, &iocb, true);
+ ret = aio_fsync(&req->fsync, iocb, true);
break;
case IOCB_CMD_POLL:
- ret = aio_poll(req, &iocb);
+ ret = aio_poll(req, iocb);
break;
default:
- pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
+ pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
ret = -EINVAL;
break;
}
@@ -1864,14 +1882,25 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
goto out_put_req;
return 0;
out_put_req:
- put_reqs_available(ctx, 1);
- percpu_ref_put(&ctx->reqs);
if (req->ki_eventfd)
eventfd_ctx_put(req->ki_eventfd);
- kmem_cache_free(kiocb_cachep, req);
+ iocb_put(req);
+out_put_reqs_available:
+ put_reqs_available(ctx, 1);
return ret;
}
+static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+ bool compat)
+{
+ struct iocb iocb;
+
+ if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+ return -EFAULT;
+
+ return __io_submit_one(ctx, &iocb, user_iocb, compat);
+}
+
/* sys_io_submit:
* Queue the nr iocbs pointed to by iocbpp for processing. Returns
* the number of iocbs queued. May return -EINVAL if the aio_context
@@ -1904,7 +1933,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
if (nr > ctx->nr_events)
nr = ctx->nr_events;
- blk_start_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
struct iocb __user *user_iocb;
@@ -1917,7 +1947,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
if (ret)
break;
}
- blk_finish_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_finish_plug(&plug);
percpu_ref_put(&ctx->users);
return i ? i : ret;
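
For the plug to be set up at all, a submission has to carry more than AIO_PLUG_THRESHOLD iocbs. A hedged userspace sketch of such a batched submission using the raw syscalls — no libaio; "testfile" is a placeholder path and error handling is abbreviated:

	#include <fcntl.h>
	#include <linux/aio_abi.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		aio_context_t ctx = 0;
		struct iocb cbs[4], *cbp[4];
		struct io_event events[4];
		static char bufs[4][4096];
		int fd, i;

		fd = open("testfile", O_RDONLY);
		if (fd < 0 || syscall(SYS_io_setup, 8, &ctx) < 0)
			return 1;

		for (i = 0; i < 4; i++) {
			memset(&cbs[i], 0, sizeof(cbs[i]));
			cbs[i].aio_lio_opcode = IOCB_CMD_PREAD;
			cbs[i].aio_fildes = fd;
			cbs[i].aio_buf = (unsigned long)bufs[i];
			cbs[i].aio_nbytes = sizeof(bufs[i]);
			cbs[i].aio_offset = (long long)i * 4096;
			cbp[i] = &cbs[i];
		}

		/* 4 > AIO_PLUG_THRESHOLD, so the kernel can plug the queue */
		if (syscall(SYS_io_submit, ctx, 4, cbp) != 4)
			return 1;
		if (syscall(SYS_io_getevents, ctx, 4, 4, events, NULL) == 4)
			for (i = 0; i < 4; i++)
				printf("res=%lld\n", (long long)events[i].res);

		syscall(SYS_io_destroy, ctx);
		return 0;
	}
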
@@ -1944,7 +1975,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
if (nr > ctx->nr_events)
nr = ctx->nr_events;
- blk_start_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
compat_uptr_t user_iocb;
@@ -1957,7 +1989,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
if (ret)
break;
}
- blk_finish_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_finish_plug(&plug);
percpu_ref_put(&ctx->users);
return i ? i : ret;
@@ -2062,11 +2095,13 @@ static long do_io_getevents(aio_context_t ctx_id,
* specifies an infinite timeout. Note that the timeout pointed to by
* timeout is relative. Will fail with -ENOSYS if not implemented.
*/
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
long, nr,
struct io_event __user *, events,
- struct timespec __user *, timeout)
+ struct __kernel_timespec __user *, timeout)
{
struct timespec64 ts;
int ret;
@@ -2080,6 +2115,8 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
return ret;
}
+#endif
+
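The switch to __kernel_timespec above is part of the y2038 work: the native syscall now takes 64-bit seconds on every ABI, while the legacy 32-bit layout moves to the *_time32 entry points. A sketch of the two layouts — the authoritative definitions live in the UAPI headers, so these names are stand-ins:

	#include <stdint.h>
	#include <stdio.h>

	struct sketch_kernel_timespec {
		int64_t tv_sec;		/* 64-bit seconds on every ABI */
		long long tv_nsec;
	};

	struct sketch_old_timespec32 {
		int32_t tv_sec;		/* overflows in January 2038 */
		int32_t tv_nsec;
	};

	int main(void)
	{
		struct sketch_old_timespec32 old = { 5, 500 };
		struct sketch_kernel_timespec ts = {
			.tv_sec  = old.tv_sec,	/* sign-extend to 64 bits */
			.tv_nsec = old.tv_nsec,
		};

		printf("%lld.%09lld\n", (long long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}
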
struct __aio_sigset {
const sigset_t __user *sigmask;
size_t sigsetsize;
@@ -2090,7 +2127,7 @@ SYSCALL_DEFINE6(io_pgetevents,
long, min_nr,
long, nr,
struct io_event __user *, events,
- struct timespec __user *, timeout,
+ struct __kernel_timespec __user *, timeout,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
@@ -2104,38 +2141,61 @@ SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- if (ksig.sigmask) {
- if (ksig.sigsetsize != sizeof(sigset_t))
- return -EINVAL;
- if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
- return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- if (signal_pending(current)) {
- if (ksig.sigmask) {
- current->saved_sigmask = sigsaved;
- set_restore_sigmask();
- }
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
- if (!ret)
- ret = -ERESTARTNOHAND;
- } else {
- if (ksig.sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
+ return ret;
+}
+
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE6(io_pgetevents_time32,
+ aio_context_t, ctx_id,
+ long, min_nr,
+ long, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout,
+ const struct __aio_sigset __user *, usig)
+{
+ struct __aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
+
+ ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
return ret;
}
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
- compat_long_t, min_nr,
- compat_long_t, nr,
- struct io_event __user *, events,
- struct old_timespec32 __user *, timeout)
+#endif
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+ __s32, min_nr,
+ __s32, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
@@ -2149,12 +2209,17 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
return ret;
}
+#endif
+
+#ifdef CONFIG_COMPAT
struct __compat_aio_sigset {
compat_sigset_t __user *sigmask;
compat_size_t sigsetsize;
};
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
COMPAT_SYSCALL_DEFINE6(io_pgetevents,
compat_aio_context_t, ctx_id,
compat_long_t, min_nr,
@@ -2174,27 +2239,47 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- if (ksig.sigmask) {
- if (ksig.sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (get_compat_sigset(&ksigmask, ksig.sigmask))
- return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- if (signal_pending(current)) {
- if (ksig.sigmask) {
- current->saved_sigmask = sigsaved;
- set_restore_sigmask();
- }
- if (!ret)
- ret = -ERESTARTNOHAND;
- } else {
- if (ksig.sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
+
+ return ret;
+}
+
+#endif
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
+ compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct __kernel_timespec __user *, timeout,
+ const struct __compat_aio_sigset __user *, usig)
+{
+ struct __compat_aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 t;
+ int ret;
+
+ if (timeout && get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
+
+ ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
return ret;
}
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 9f9cadbfbd7a..70c132acdab1 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -42,6 +42,8 @@
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+extern struct file_system_type autofs_fs_type;
+
/*
* Unified info structure. This is pointed to by both the dentry and
* inode structures. Each file in the filesystem has an instance of this
@@ -101,16 +103,20 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_MAGIC 0x6d4a556d
+#define AUTOFS_SBI_CATATONIC 0x0001
+#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE 0x0004
+
struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
struct pid *oz_pgrp;
- int catatonic;
int version;
int sub_version;
int min_proto;
int max_proto;
+ unsigned int flags;
unsigned long exp_timeout;
unsigned int type;
struct super_block *sb;
@@ -126,8 +132,7 @@ struct autofs_sb_info {
static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
{
- return sb->s_magic != AUTOFS_SUPER_MAGIC ?
- NULL : (struct autofs_sb_info *)(sb->s_fs_info);
+ return (struct autofs_sb_info *)(sb->s_fs_info);
}
static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
@@ -141,7 +146,8 @@ static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
*/
static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
{
- return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
+ return ((sbi->flags & AUTOFS_SBI_CATATONIC) ||
+ task_pgrp(current) == sbi->oz_pgrp);
}
struct inode *autofs_get_inode(struct super_block *, umode_t);
@@ -210,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
+ /* We don't expect -EAGAIN */
+ pipe->f_flags &= ~O_NONBLOCK;
return 0;
}
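
The O_DIRECT flag forced on the autofs pipe above puts it into packet mode, so the daemon reads exactly one request per read(). The same behaviour can be observed from userspace with pipe2() (Linux 3.4+):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		char buf[64];
		ssize_t n;

		if (pipe2(fds, O_DIRECT) < 0) {
			perror("pipe2");
			return 1;
		}

		write(fds[1], "first", 5);
		write(fds[1], "second", 6);

		/* A big read still returns only the first 5-byte packet. */
		n = read(fds[0], buf, sizeof(buf));
		printf("got %zd bytes: %.*s\n", n, (int)n, buf);
		return 0;
	}
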
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 86eafda4a652..e9fe74d1541b 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -151,22 +151,6 @@ out:
return err;
}
-/*
- * Get the autofs super block info struct from the file opened on
- * the autofs mount point.
- */
-static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
-{
- struct autofs_sb_info *sbi = NULL;
- struct inode *inode;
-
- if (f) {
- inode = file_inode(f);
- sbi = autofs_sbi(inode->i_sb);
- }
- return sbi;
-}
-
/* Return autofs dev ioctl version */
static int autofs_dev_ioctl_version(struct file *fp,
struct autofs_sb_info *sbi,
@@ -366,7 +350,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
pipefd = param->setpipefd.pipefd;
mutex_lock(&sbi->wq_mutex);
- if (!sbi->catatonic) {
+ if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) {
mutex_unlock(&sbi->wq_mutex);
return -EBUSY;
} else {
@@ -393,7 +377,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
- sbi->catatonic = 0;
+ sbi->flags &= ~AUTOFS_SBI_CATATONIC;
}
out:
put_pid(new_pid);
@@ -658,6 +642,8 @@ static int _autofs_dev_ioctl(unsigned int command,
if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
+ struct super_block *sb;
+
fp = fget(param->ioctlfd);
if (!fp) {
if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
@@ -666,12 +652,13 @@ static int _autofs_dev_ioctl(unsigned int command,
goto out;
}
- sbi = autofs_dev_ioctl_sbi(fp);
- if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
+ sb = file_inode(fp)->i_sb;
+ if (sb->s_type != &autofs_fs_type) {
err = -EINVAL;
fput(fp);
goto out;
}
+ sbi = autofs_sbi(sb);
/*
* Admin needs to be able to set the mount catatonic in
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index d441244b79df..28d9c2b1b3bb 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb,
pkt.len = dentry->d_name.len;
memcpy(pkt.name, dentry->d_name.name, pkt.len);
pkt.name[pkt.len] = '\0';
- dput(dentry);
if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
@@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb,
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
+ dput(dentry);
+
return ret;
}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index 79ae07d9592f..c0c1db2cc6ea 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -16,7 +16,7 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type,
return mount_nodev(fs_type, flags, data, autofs_fill_super);
}
-static struct file_system_type autofs_fs_type = {
+struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
.mount = autofs_mount,
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 846c052569dd..80597b88718b 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -82,16 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",maxproto=%d", sbi->max_proto);
if (autofs_type_offset(sbi->type))
- seq_printf(m, ",offset");
+ seq_puts(m, ",offset");
else if (autofs_type_direct(sbi->type))
- seq_printf(m, ",direct");
+ seq_puts(m, ",direct");
else
- seq_printf(m, ",indirect");
+ seq_puts(m, ",indirect");
+ if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
+ seq_puts(m, ",strictexpire");
+ if (sbi->flags & AUTOFS_SBI_IGNORE)
+ seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
- seq_printf(m, ",pipe_ino=-1");
+ seq_puts(m, ",pipe_ino=-1");
#endif
return 0;
}
@@ -109,7 +113,8 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+ Opt_ignore};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -121,24 +126,29 @@ static const match_table_t tokens = {
{Opt_indirect, "indirect"},
{Opt_direct, "direct"},
{Opt_offset, "offset"},
+ {Opt_strictexpire, "strictexpire"},
+ {Opt_ignore, "ignore"},
{Opt_err, NULL}
};
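
The table above drives the kernel's match_token() parser over comma-separated mount options. A rough userspace equivalent using strsep() and sscanf(), with the option names mirroring the table and everything else illustrative:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char opts[] = "fd=5,minproto=5,strictexpire";
		char *p, *s = opts;
		int pipefd = -1, minproto = -1, strictexpire = 0;

		while ((p = strsep(&s, ",")) != NULL) {
			if (!*p)
				continue;	/* skip empty fields */
			if (sscanf(p, "fd=%d", &pipefd) == 1)
				continue;
			if (sscanf(p, "minproto=%d", &minproto) == 1)
				continue;
			if (!strcmp(p, "strictexpire"))
				strictexpire = 1;
		}

		printf("fd=%d minproto=%d strictexpire=%d\n",
		       pipefd, minproto, strictexpire);
		return 0;
	}
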
-static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
- int *pgrp, bool *pgrp_set, unsigned int *type,
- int *minproto, int *maxproto)
+static int parse_options(char *options,
+ struct inode *root, int *pgrp, bool *pgrp_set,
+ struct autofs_sb_info *sbi)
{
char *p;
substring_t args[MAX_OPT_ARGS];
int option;
+ int pipefd = -1;
+ kuid_t uid;
+ kgid_t gid;
- *uid = current_uid();
- *gid = current_gid();
+ root->i_uid = current_uid();
+ root->i_gid = current_gid();
- *minproto = AUTOFS_MIN_PROTO_VERSION;
- *maxproto = AUTOFS_MAX_PROTO_VERSION;
+ sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+ sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
- *pipefd = -1;
+ sbi->pipefd = -1;
if (!options)
return 1;
@@ -152,22 +162,25 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
token = match_token(p, tokens, args);
switch (token) {
case Opt_fd:
- if (match_int(args, pipefd))
+ if (match_int(args, &pipefd))
return 1;
+ sbi->pipefd = pipefd;
break;
case Opt_uid:
if (match_int(args, &option))
return 1;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
return 1;
+ root->i_uid = uid;
break;
case Opt_gid:
if (match_int(args, &option))
return 1;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
return 1;
+ root->i_gid = gid;
break;
case Opt_pgrp:
if (match_int(args, &option))
@@ -178,27 +191,33 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
case Opt_minproto:
if (match_int(args, &option))
return 1;
- *minproto = option;
+ sbi->min_proto = option;
break;
case Opt_maxproto:
if (match_int(args, &option))
return 1;
- *maxproto = option;
+ sbi->max_proto = option;
break;
case Opt_indirect:
- set_autofs_type_indirect(type);
+ set_autofs_type_indirect(&sbi->type);
break;
case Opt_direct:
- set_autofs_type_direct(type);
+ set_autofs_type_direct(&sbi->type);
break;
case Opt_offset:
- set_autofs_type_offset(type);
+ set_autofs_type_offset(&sbi->type);
+ break;
+ case Opt_strictexpire:
+ sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+ break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
break;
default:
return 1;
}
}
- return (*pipefd < 0);
+ return (sbi->pipefd < 0);
}
int autofs_fill_super(struct super_block *s, void *data, int silent)
@@ -206,7 +225,6 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
struct inode *root_inode;
struct dentry *root;
struct file *pipe;
- int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
int pgrp = 0;
@@ -222,12 +240,12 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
sbi->magic = AUTOFS_SBI_MAGIC;
sbi->pipefd = -1;
sbi->pipe = NULL;
- sbi->catatonic = 1;
sbi->exp_timeout = 0;
sbi->oz_pgrp = NULL;
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
+ sbi->flags = AUTOFS_SBI_CATATONIC;
set_autofs_type_indirect(&sbi->type);
sbi->min_proto = 0;
sbi->max_proto = 0;
@@ -255,16 +273,16 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
}
root_inode = autofs_get_inode(s, S_IFDIR | 0755);
root = d_make_root(root_inode);
- if (!root)
+ if (!root) {
+ ret = -ENOMEM;
goto fail_ino;
+ }
pipe = NULL;
root->d_fsdata = ino;
/* Can this call block? */
- if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
- &sbi->max_proto)) {
+ if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
pr_err("called with bogus options\n");
goto fail_dput;
}
@@ -303,8 +321,9 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
root_inode->i_fop = &autofs_root_operations;
root_inode->i_op = &autofs_dir_inode_operations;
- pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
- pipe = fget(pipefd);
+ pr_debug("pipe fd = %d, pgrp = %u\n",
+ sbi->pipefd, pid_nr(sbi->oz_pgrp));
+ pipe = fget(sbi->pipefd);
if (!pipe) {
pr_err("could not open pipe file descriptor\n");
@@ -314,8 +333,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
if (ret < 0)
goto fail_fput;
sbi->pipe = pipe;
- sbi->pipefd = pipefd;
- sbi->catatonic = 0;
+ sbi->flags &= ~AUTOFS_SBI_CATATONIC;
/*
* Success! Install the root dentry now to indicate completion.
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 782e57b911ab..1246f396bf0e 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -275,8 +275,11 @@ static int autofs_mount_wait(const struct path *path, bool rcu_walk)
pr_debug("waiting for mount name=%pd\n", path->dentry);
status = autofs_wait(sbi, path, NFY_MOUNT);
pr_debug("mount wait done status=%d\n", status);
+ ino->last_used = jiffies;
+ return status;
}
- ino->last_used = jiffies;
+ if (!(sbi->flags & AUTOFS_SBI_STRICTEXPIRE))
+ ino->last_used = jiffies;
return status;
}
@@ -510,7 +513,8 @@ static struct dentry *autofs_lookup(struct inode *dir,
sbi = autofs_sbi(dir->i_sb);
pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
+ current->pid, task_pgrp_nr(current),
+ sbi->flags & AUTOFS_SBI_CATATONIC,
autofs_oz_mode(sbi));
active = autofs_lookup_active(dentry);
@@ -563,7 +567,7 @@ static int autofs_dir_symlink(struct inode *dir,
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
BUG_ON(!ino);
@@ -626,7 +630,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
if (atomic_dec_and_test(&ino->count)) {
@@ -714,7 +718,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
spin_lock(&sbi->lookup_lock);
@@ -759,7 +763,7 @@ static int autofs_dir_mkdir(struct inode *dir,
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index f6385c6ef0a5..15a3e31d0904 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -20,14 +20,14 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
struct autofs_wait_queue *wq, *nwq;
mutex_lock(&sbi->wq_mutex);
- if (sbi->catatonic) {
+ if (sbi->flags & AUTOFS_SBI_CATATONIC) {
mutex_unlock(&sbi->wq_mutex);
return;
}
pr_debug("entering catatonic mode\n");
- sbi->catatonic = 1;
+ sbi->flags |= AUTOFS_SBI_CATATONIC;
wq = sbi->queues;
sbi->queues = NULL; /* Erase all wait queues */
while (wq) {
@@ -255,7 +255,7 @@ static int validate_request(struct autofs_wait_queue **wait,
struct autofs_wait_queue *wq;
struct autofs_info *ino;
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -ENOENT;
/* Wait in progress, continue; */
@@ -290,7 +290,7 @@ static int validate_request(struct autofs_wait_queue **wait,
if (mutex_lock_interruptible(&sbi->wq_mutex))
return -EINTR;
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -ENOENT;
wq = autofs_find_wait(sbi, qstr);
@@ -359,7 +359,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
pid_t tgid;
/* In catatonic mode, we don't wait for anybody */
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -ENOENT;
/*
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 67aef3bb89e4..606f9378b2f0 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -1,13 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/bfs/bfs.h
- * Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
*/
#ifndef _FS_BFS_BFS_H
#define _FS_BFS_BFS_H
#include <linux/bfs_fs.h>
+/*
+ * In theory BFS supports up to 512 inodes, numbered from 2 (for /) up
+ * to 513 inclusive. In actual fact, attempting to create the 512th
+ * inode (i.e. inode No. 513, or file No. 511) will fail with ENOSPC in
+ * bfs_add_entry(): the root directory cannot contain so many entries,
+ * counting '..'. So mkfs.bfs(8) should really limit its -N option to
+ * 511 and not 512. For now, we just print a warning if a filesystem is
+ * mounted with such an "impossible to fill up" number of inodes.
+ */
+#define BFS_MAX_LASTI 513
+
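A worked example of the si_lasti arithmetic used in bfs_fill_super(), assuming BFS_BSIZE == 512 and a 64-byte on-disk struct bfs_inode as in <linux/bfs_fs.h>: an image created with -N 512 lands exactly on BFS_MAX_LASTI.

	#include <stdio.h>

	int main(void)
	{
		unsigned long bsize = 512, isize = 64, root_ino = 2;
		/* data starts after the superblock plus 512 inode slots */
		unsigned long s_start = bsize + 512 * isize;	/* 33280 */
		unsigned long lasti = (s_start - bsize) / isize + root_ino - 1;

		printf("lasti = %lu\n", lasti);	/* prints 513 == BFS_MAX_LASTI */
		return 0;
	}
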
/*
* BFS file system in-core superblock info
*/
@@ -17,7 +24,7 @@ struct bfs_sb_info {
unsigned long si_freei;
unsigned long si_lf_eblk;
unsigned long si_lasti;
- unsigned long *si_imap;
+ DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1);
struct mutex bfs_lock;
};
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index f32f21c3bbc7..d8dfe3a0cb39 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,8 +2,8 @@
/*
* fs/bfs/dir.c
* BFS directory operations.
- * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
- * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
*/
#include <linux/time.h>
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 1476cdd90cfb..0dceefc54b48 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,7 +2,7 @@
/*
* fs/bfs/file.c
* BFS file operations.
- * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
*
* Make the file block allocation algorithm understand the size
* of the underlying block device.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 9a69392f1fb3..d136b2aaafb3 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,10 +1,9 @@
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
- * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
- *
- * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
+ * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
*/
#include <linux/module.h>
@@ -118,12 +117,12 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
unsigned int ino = (u16)inode->i_ino;
- unsigned long i_sblock;
+ unsigned long i_sblock;
struct bfs_inode *di;
struct buffer_head *bh;
int err = 0;
- dprintf("ino=%08x\n", ino);
+ dprintf("ino=%08x\n", ino);
di = find_inode(inode->i_sb, ino, &bh);
if (IS_ERR(di))
@@ -144,7 +143,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- i_sblock = BFS_I(inode)->i_sblock;
+ i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
@@ -188,13 +187,13 @@ static void bfs_evict_inode(struct inode *inode)
mark_buffer_dirty(bh);
brelse(bh);
- if (bi->i_dsk_ino) {
+ if (bi->i_dsk_ino) {
if (bi->i_sblock)
info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
info->si_freei++;
clear_bit(ino, info->si_imap);
- bfs_dump_imap("delete_inode", s);
- }
+ bfs_dump_imap("evict_inode", s);
+ }
/*
* If this was the last file, make the previous block
@@ -214,7 +213,6 @@ static void bfs_put_super(struct super_block *s)
return;
mutex_destroy(&info->bfs_lock);
- kfree(info->si_imap);
kfree(info);
s->s_fs_info = NULL;
}
@@ -311,8 +309,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
else
strcat(tmpbuf, "0");
}
- printf("BFS-fs: %s: lasti=%08lx <%s>\n",
- prefix, BFS_SB(s)->si_lasti, tmpbuf);
+ printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
free_page((unsigned long)tmpbuf);
#endif
}
@@ -322,7 +319,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct buffer_head *bh, *sbh;
struct bfs_super_block *bfs_sb;
struct inode *inode;
- unsigned i, imap_len;
+ unsigned i;
struct bfs_sb_info *info;
int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
@@ -341,8 +338,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
bfs_sb = (struct bfs_super_block *)sbh->b_data;
if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
if (!silent)
- printf("No BFS filesystem on %s (magic=%08x)\n",
- s->s_id, le32_to_cpu(bfs_sb->s_magic));
+ printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic));
goto out1;
}
if (BFS_UNCLEAN(bfs_sb, s) && !silent)
@@ -350,18 +346,19 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
s->s_magic = BFS_MAGIC;
- if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
- printf("Superblock is corrupted\n");
+ if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
+ le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) {
+ printf("Superblock is corrupted on %s\n", s->s_id);
goto out1;
}
- info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
- sizeof(struct bfs_inode)
- + BFS_ROOT_INO - 1;
- imap_len = (info->si_lasti / 8) + 1;
- info->si_imap = kzalloc(imap_len, GFP_KERNEL);
- if (!info->si_imap)
+ info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
+ if (info->si_lasti == BFS_MAX_LASTI)
+ printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+ else if (info->si_lasti > BFS_MAX_LASTI) {
+ printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
goto out1;
+ }
for (i = 0; i < BFS_ROOT_INO; i++)
set_bit(i, info->si_imap);
@@ -369,26 +366,25 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
inode = bfs_iget(s, BFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- goto out2;
+ goto out1;
}
s->s_root = d_make_root(inode);
if (!s->s_root) {
ret = -ENOMEM;
- goto out2;
+ goto out1;
}
info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
- info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
- - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
+ info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
info->si_freei = 0;
info->si_lf_eblk = 0;
/* can we read the last block? */
bh = sb_bread(s, info->si_blocks - 1);
if (!bh) {
- printf("Last block not available: %lu\n", info->si_blocks - 1);
+ printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1);
ret = -EIO;
- goto out3;
+ goto out2;
}
brelse(bh);
@@ -422,11 +418,11 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
(i_eoff != le32_to_cpu(-1) && i_eoff > s_size) ||
i_sblock * BFS_BSIZE > i_eoff) {
- printf("Inode 0x%08x corrupted\n", i);
+ printf("Inode 0x%08x corrupted on %s\n", i, s->s_id);
brelse(bh);
ret = -EIO;
- goto out3;
+ goto out2;
}
if (!di->i_ino) {
@@ -442,14 +438,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
}
brelse(bh);
brelse(sbh);
- bfs_dump_imap("read_super", s);
+ bfs_dump_imap("fill_super", s);
return 0;
-out3:
+out2:
dput(s->s_root);
s->s_root = NULL;
-out2:
- kfree(info->si_imap);
out1:
brelse(sbh);
out:
@@ -479,7 +473,7 @@ static int __init init_bfs_fs(void)
int err = init_inodecache();
if (err)
goto out1;
- err = register_filesystem(&bfs_fs_type);
+ err = register_filesystem(&bfs_fs_type);
if (err)
goto out;
return 0;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index c3deb2e35f20..1fefd87eb4b4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -29,97 +29,14 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/a.out-core.h>
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-#ifdef CONFIG_COREDUMP
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-static int aout_core_dump(struct coredump_params *cprm)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- void __user *dump_start;
- int dump_size;
- struct user dump;
-#ifdef __alpha__
-# define START_DATA(u) ((void __user *)u.start_data)
-#else
-# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
- u.start_code))
-#endif
-# define START_STACK(u) ((void __user *)u.start_stack)
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
- dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->siginfo->si_signo;
- aout_dump_thread(cprm->regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
- if we wrote the stack, but not the data area. */
- if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
- dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
-/* struct user */
- if (!dump_emit(cprm, &dump, sizeof(dump)))
- goto end_coredump;
-/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
- goto end_coredump;
-/* now we start writing out the user space info */
- set_fs(USER_DS);
-/* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-/* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#else
-#define aout_core_dump NULL
-#endif
-
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
};
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 54207327f98f..7d09d125f148 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -57,8 +57,6 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
- int, int, unsigned long);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
@@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
#ifndef elf_map
static unsigned long elf_map(struct file *filep, unsigned long addr,
- struct elf_phdr *eppnt, int prot, int type,
+ const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
{
unsigned long map_addr;
@@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
#endif /* !elf_map */
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
* header pointed to by elf_ex, into a newly allocated array. The caller is
* responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
*/
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, size, err = -1;
+ int retval, err = -1;
loff_t pos = elf_ex->e_phoff;
+ unsigned int size;
/*
* If the size of this structure has changed, then punt, since
@@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
goto out;
/* Sanity check the number of program headers... */
- if (elf_ex->e_phnum < 1 ||
- elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
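The collapsed check works because e_phnum is a 16-bit ELF field, so the product fits comfortably in an unsigned int and a single size test rejects both an empty and an oversized table. The arithmetic, assuming the 56-byte Elf64_Phdr:

	#include <stdio.h>

	int main(void)
	{
		unsigned int phdr_size = 56;	/* sizeof(Elf64_Phdr) */
		unsigned int e_phnum = 65535;	/* largest possible u16 */
		unsigned int size = e_phnum * phdr_size; /* no overflow */

		printf("worst case %u bytes; the 65536 limit admits %u headers\n",
		       size, 65536 / phdr_size);
		return 0;
	}
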
@@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
- struct list_head *t;
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_add(&ets->list, &info->thread_list);
}
- list_for_each(t, &info->thread_list) {
+ list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- ets = list_entry(t, struct elf_thread_status, list);
sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
@@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
+ struct elf_thread_status *ets;
int i;
- struct list_head *t;
for (i = 0; i < info->numnote; i++)
if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
- list_for_each(t, &info->thread_list) {
- struct elf_thread_status *tmp =
- list_entry(t, struct elf_thread_status, list);
-
- for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm))
+ list_for_each_entry(ets, &info->thread_list, list) {
+ for (i = 0; i < ets->num_notes; i++)
+ if (!writenote(&ets->notes[i], cprm))
return 0;
}
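
The two hunks above swap open-coded list_entry() lookups for the typed list_for_each_entry() iterator. As a rough userspace sketch of the same intrusive-list idiom (list_head, container_of() and the thread-status struct below are simplified stand-ins, not the kernel definitions):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Typed iterator: yields the containing struct directly, so the loop
 * body no longer needs an explicit list_entry() call. */
#define list_for_each_entry(pos, head, member) \
        for (pos = container_of((head)->next, typeof(*pos), member); \
             &pos->member != (head); \
             pos = container_of(pos->member.next, typeof(*pos), member))

struct thread_status {
        struct list_head list;
        int num_notes;
};

int main(void)
{
        struct list_head head = { &head, &head };
        struct thread_status a = { .num_notes = 2 }, b = { .num_notes = 3 };
        struct thread_status *ets;

        /* Hand-build head -> a -> b -> head. */
        head.next = &a.list; a.list.prev = &head;
        a.list.next = &b.list; b.list.prev = &a.list;
        b.list.next = &head; head.prev = &b.list;

        list_for_each_entry(ets, &head, list)
                printf("%d notes\n", ets->num_notes);
        return 0;
}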
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f46ad26..e996174cbfc0 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,13 +14,30 @@
#include <linux/err.h>
#include <linux/fs.h>
+static inline bool spacetab(char c) { return c == ' ' || c == '\t'; }
+static inline char *next_non_spacetab(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (!spacetab(*first))
+ return first;
+ return NULL;
+}
+static inline char *next_terminator(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (spacetab(*first) || !*first)
+ return first;
+ return NULL;
+}
+
static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
- char *cp;
+ char *cp, *buf_end;
struct file *file;
int retval;
+ /* Not ours to exec if we don't start with "#!". */
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
@@ -33,18 +50,40 @@ static int load_script(struct linux_binprm *bprm)
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
return -ENOENT;
- /*
- * This section does the #! interpretation.
- * Sorta complicated, but hopefully it will work. -TYT
- */
-
+ /* Release since we are not mapping a binary into memory. */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
- bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
- if ((cp = strchr(bprm->buf, '\n')) == NULL)
- cp = bprm->buf+BINPRM_BUF_SIZE-1;
+ /*
+ * This section handles parsing the #! line into separate
+ * interpreter path and argument strings. We must be careful
+ * because bprm->buf is not yet guaranteed to be NUL-terminated
+ * (though the buffer will have trailing NUL padding when the
+ * file size was smaller than the buffer size).
+ *
+ * We do not want to exec a truncated interpreter path, so either
+ * we find a newline (which indicates nothing is truncated), or
+ * we find a space/tab/NUL after the interpreter path (which
+ * itself may be preceded by spaces/tabs). Truncating the
+ * arguments is fine: the interpreter can re-read the script to
+ * parse them on its own.
+ */
+ buf_end = bprm->buf + sizeof(bprm->buf) - 1;
+ cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n');
+ if (!cp) {
+ cp = next_non_spacetab(bprm->buf + 2, buf_end);
+ if (!cp)
+ return -ENOEXEC; /* Entire buf is spaces/tabs */
+ /*
+ * If there is no later space/tab/NUL we must assume the
+ * interpreter path is truncated.
+ */
+ if (!next_terminator(cp, buf_end))
+ return -ENOEXEC;
+ cp = buf_end;
+ }
+ /* NUL-terminate the buffer and any trailing spaces/tabs. */
*cp = '\0';
while (cp > bprm->buf) {
cp--;
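
As a hedged illustration of the parse described in the comment block above, here is a small userspace sketch that reuses the two helpers from this hunk; BUF_SIZE, parse_shebang() and the use of memchr() in place of the kernel's strnchr() are assumptions made for the example.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BUF_SIZE 128   /* stand-in for BINPRM_BUF_SIZE */

static bool spacetab(char c) { return c == ' ' || c == '\t'; }

static char *next_non_spacetab(char *first, const char *last)
{
        for (; first <= last; first++)
                if (!spacetab(*first))
                        return first;
        return NULL;
}

static char *next_terminator(char *first, const char *last)
{
        for (; first <= last; first++)
                if (spacetab(*first) || !*first)
                        return first;
        return NULL;
}

/* Returns 0 and NUL-terminates buf after the #! line, or -1 when the
 * interpreter path may be truncated (mirrors returning -ENOEXEC). */
static int parse_shebang(char buf[BUF_SIZE])
{
        char *buf_end = buf + BUF_SIZE - 1;
        char *cp = memchr(buf, '\n', BUF_SIZE);   /* strnchr() stand-in */

        if (!cp) {
                cp = next_non_spacetab(buf + 2, buf_end);
                if (!cp)
                        return -1;        /* entire buf is spaces/tabs */
                if (!next_terminator(cp, buf_end))
                        return -1;        /* path runs off the buffer */
                cp = buf_end;
        }
        *cp = '\0';
        return 0;
}

int main(void)
{
        char buf[BUF_SIZE] = "#!/bin/sh -e\n";

        if (parse_shebang(buf) == 0)
                printf("kept: %s\n", buf);
        return 0;
}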
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38b8ce05cbc7..e9faa52bb489 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+static void set_init_blocksize(struct block_device *bdev)
+{
+ unsigned bsize = bdev_logical_block_size(bdev);
+ loff_t size = i_size_read(bdev->bd_inode);
+
+ while (bsize < PAGE_SIZE) {
+ if (size & bsize)
+ break;
+ bsize <<= 1;
+ }
+ bdev->bd_block_size = bsize;
+ bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
int set_blocksize(struct block_device *bdev, int size)
{
/* Size must be a power of two, and between 512 and PAGE_SIZE */
@@ -181,7 +195,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
struct task_struct *waiter = bio->bi_private;
WRITE_ONCE(bio->bi_private, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
}
static ssize_t
@@ -197,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
ssize_t ret;
blk_qc_t qc;
int i;
+ struct bvec_iter_all iter_all;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -232,6 +247,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_opf = dio_bio_write_op(iocb);
task_io_account_write(ret);
}
+ if (iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
for (;;) {
@@ -239,12 +256,12 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(bio.bi_private))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc))
+ !blk_poll(bdev_get_queue(bdev), qc, true))
io_schedule();
}
__set_current_state(TASK_RUNNING);
- bio_for_each_segment_all(bvec, &bio, i) {
+ bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
@@ -277,6 +294,14 @@ struct blkdev_dio {
static struct bio_set blkdev_dio_pool;
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+ struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@@ -298,12 +323,13 @@ static void blkdev_bio_end_io(struct bio *bio)
}
dio->iocb->ki_complete(iocb, ret, 0);
- bio_put(&dio->bio);
+ if (dio->multi_bio)
+ bio_put(&dio->bio);
} else {
struct task_struct *waiter = dio->waiter;
WRITE_ONCE(dio->waiter, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
}
}
@@ -312,8 +338,9 @@ static void blkdev_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
@@ -328,6 +355,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
+ bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
@@ -338,20 +366,27 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
- bio_get(bio); /* extra ref for the completion handler */
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
- if (dio->is_sync)
+ if (dio->is_sync) {
dio->waiter = current;
- else
+ bio_get(bio);
+ } else {
dio->iocb = iocb;
+ }
dio->size = 0;
dio->multi_bio = false;
- dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+ dio->should_dirty = is_read && iter_is_iovec(iter);
+
+ /*
+ * Don't plug for HIPRI/polled IO, as those should go straight
+ * to issue
+ */
+ if (!is_poll)
+ blk_start_plug(&plug);
- blk_start_plug(&plug);
for (;;) {
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9;
@@ -381,11 +416,28 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
+ bool polled = false;
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio_set_polled(bio, iocb);
+ polled = true;
+ }
+
qc = submit_bio(bio);
+
+ if (polled)
+ WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
if (!dio->multi_bio) {
+ /*
+ * AIO needs an extra reference to ensure the dio
+ * structure which is embedded into the first bio
+ * stays around.
+ */
+ if (!is_sync)
+ bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
@@ -395,7 +447,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages);
}
- blk_finish_plug(&plug);
+
+ if (!is_poll)
+ blk_finish_plug(&plug);
if (!is_sync)
return -EIOCBQUEUED;
@@ -406,7 +460,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc))
+ !blk_poll(bdev_get_queue(bdev), qc, true))
io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -1408,18 +1462,9 @@ EXPORT_SYMBOL(check_disk_change);
void bd_set_size(struct block_device *bdev, loff_t size)
{
- unsigned bsize = bdev_logical_block_size(bdev);
-
inode_lock(bdev->bd_inode);
i_size_write(bdev->bd_inode, size);
inode_unlock(bdev->bd_inode);
- while (bsize < PAGE_SIZE) {
- if (size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_block_size = bsize;
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);
@@ -1496,8 +1541,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret)
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ set_init_blocksize(bdev);
+ }
/*
* If the device is invalidated, rescan partition
@@ -1532,6 +1579,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ set_init_blocksize(bdev);
}
if (bdev->bd_bdi == &noop_backing_dev_info)
@@ -1966,6 +2014,7 @@ static const struct address_space_operations def_blk_aops = {
.writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .migratepage = buffer_migrate_page_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
@@ -2044,6 +2093,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+ .iopoll = blkdev_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
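
The set_init_blocksize() helper added above moves the block-size computation out of bd_set_size(), which now only updates i_size; __blkdev_get() calls the helper afterwards. A standalone sketch of that loop, which grows the logical block size to the largest power of two that is at most PAGE_SIZE and still evenly divides the device size (the PAGE_SIZE constant and the wrapper name are assumptions here):

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long init_blocksize(unsigned long logical_bs,
                                    unsigned long long size)
{
        unsigned long bsize = logical_bs;

        /* If the bsize bit is set in size, doubling bsize would no
         * longer divide size evenly, so stop there. */
        while (bsize < PAGE_SIZE) {
                if (size & bsize)
                        break;
                bsize <<= 1;
        }
        return bsize;
}

int main(void)
{
        /* 512-byte sectors on a 1 MiB + 1 KiB device: result is 1024. */
        printf("%lu\n", init_blocksize(512, (1ULL << 20) + 1024));
        return 0;
}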
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c957ea6f..5810463dc6d2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
}
if (acl) {
+ unsigned int nofs_flag;
+
size = posix_acl_xattr_size(acl->a_count);
+ /*
+ * We're holding a transaction handle, so use a NOFS memory
+ * allocation context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!value) {
ret = -ENOMEM;
goto out;
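
The hunk above brackets a single allocation with memalloc_nofs_save()/memalloc_nofs_restore() so reclaim cannot recurse into the filesystem while a transaction handle is held. A rough userspace model of that scoped-flag idiom, with a thread-local word standing in for the task's flag bit (every name below is invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

/* Thread-local stand-in for the task's PF_MEMALLOC_NOFS bit. */
static _Thread_local unsigned int alloc_flags;
#define FLAG_NOFS 0x1

static unsigned int nofs_save(void)
{
        unsigned int old = alloc_flags;

        alloc_flags |= FLAG_NOFS;
        return old;
}

static void nofs_restore(unsigned int old)
{
        alloc_flags = old;
}

/* An allocator that, like kmalloc under memalloc_nofs, would skip
 * filesystem reclaim while the flag is set. */
static void *model_alloc(size_t size)
{
        if (alloc_flags & FLAG_NOFS)
                printf("allocating %zu bytes without FS reclaim\n", size);
        return malloc(size);
}

int main(void)
{
        unsigned int cookie = nofs_save();
        void *value = model_alloc(64);  /* safe while a handle is held */

        nofs_restore(cookie);
        free(value);
        return 0;
}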
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494698fa..122cb97c7909 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
}
if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
+ ret->current_active, name);
else
- ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
+ ret->current_active, name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 68ebe188446a..11459fe84a29 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
}
/*
- * We maintain three seperate rbtrees: one for direct refs, one for
+ * We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
* have a key. Each tree does merge on insertion.
*
@@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
/*
- * Now it's a direct ref, put it in the the direct tree. We must
+ * Now it's a direct ref, put it in the direct tree. We must
* do this last because the ref could be merged/freed here.
*/
prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
@@ -712,7 +712,7 @@ out:
* read tree blocks and add keys where required.
*/
static int add_missing_keys(struct btrfs_fs_info *fs_info,
- struct preftrees *preftrees)
+ struct preftrees *preftrees, bool lock)
{
struct prelim_ref *ref;
struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- btrfs_tree_read_lock(eb);
+ if (lock)
+ btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
else
btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
- btrfs_tree_read_unlock(eb);
+ if (lock)
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
cond_resched();
@@ -1227,7 +1229,7 @@ again:
btrfs_release_path(path);
- ret = add_missing_keys(fs_info, &preftrees);
+ ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
@@ -1288,11 +1290,15 @@ again:
ret = -EIO;
goto out;
}
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+ if (!path->skip_locking) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_read(eb);
+ }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
@@ -2020,9 +2026,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
ret = -ENOMEM;
break;
}
- extent_buffer_get(eb);
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
item = btrfs_item_nr(slot);
@@ -2042,7 +2045,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
len = sizeof(*iref) + name_len;
iref = (struct btrfs_inode_ref *)((char *)iref + len);
}
- btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
@@ -2083,10 +2085,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
ret = -ENOMEM;
break;
}
- extent_buffer_get(eb);
-
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
item_size = btrfs_item_size_nr(eb, slot);
@@ -2107,7 +2105,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
cur_offset += btrfs_inode_extref_name_len(eb, extref);
cur_offset += sizeof(*extref);
}
- btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
offset++;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 97d91e55b70a..6f5d07415dab 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -20,7 +20,7 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+ BTRFS_INODE_ORDERED_DATA_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
@@ -29,6 +29,7 @@ enum {
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
+ BTRFS_INODE_SNAPSHOT_FLUSH,
};
/* in memory btrfs inode */
@@ -147,6 +148,12 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Track the transaction id of the last transaction used to create a
+ * hard link for the inode. This is used by the log tree (fsync).
+ */
+ u64 last_link_trans;
+
+ /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
@@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
return false;
}
+static inline bool is_data_inode(struct inode *inode)
+{
+ return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+}
+
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
int mod)
{
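
The BTRFS_INODE_* flags above were converted from explicit initializers to a bare enum; since an unadorned enum counts from 0, each constant still names the same bit index for set_bit()/test_bit()-style use. A tiny sketch of that property (the enum names here are shortened stand-ins):

#include <assert.h>
#include <stdio.h>

enum {
        INODE_ORDERED_DATA_CLOSE,       /* == 0 */
        INODE_DUMMY,                    /* == 1 */
        INODE_IN_DEFRAG,                /* == 2 */
};

int main(void)
{
        unsigned long flags = 0;

        flags |= 1UL << INODE_IN_DEFRAG;        /* set_bit() stand-in */
        assert(flags & (1UL << INODE_IN_DEFRAG));
        assert(INODE_ORDERED_DATA_CLOSE == 0);
        printf("flags=0x%lx\n", flags);
        return 0;
}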
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 2e43fba44035..b0c8094528d1 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data(
void *dstv, u32 offset, size_t len)
{
size_t cur;
- size_t offset_in_page;
+ size_t pgoff;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(block_ctx->start);
unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
WARN_ON(offset + len > block_ctx->len);
- offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1);
+ pgoff = offset_in_page(start_offset + offset);
while (len > 0) {
- cur = min(len, ((size_t)PAGE_SIZE - offset_in_page));
+ cur = min(len, ((size_t)PAGE_SIZE - pgoff));
BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
kaddr = block_ctx->datav[i];
- memcpy(dst, kaddr + offset_in_page, cur);
+ memcpy(dst, kaddr + pgoff, cur);
dst += cur;
len -= cur;
- offset_in_page = 0;
+ pgoff = 0;
i++;
}
}
@@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
BUG_ON(block_ctx->datav);
BUG_ON(block_ctx->pagev);
BUG_ON(block_ctx->mem_to_free);
- if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
block_ctx->dev_bytenr);
return -1;
@@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
num_pages = state->metablock_size >> PAGE_SHIFT;
h = (struct btrfs_header *)datav[0];
- if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE))
+ if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
return 1;
for (i = 0; i < num_pages; i++) {
@@ -1778,7 +1778,7 @@ again:
return;
}
is_metadata = 1;
- BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1));
+ BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
processed_len = BTRFS_SUPER_INFO_SIZE;
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
@@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
* write operations. Therefore it keeps the linkage
* information for a block until a block is
* rewritten. This can temporarily cause incorrect
- * and even circular linkage informations. This
+ * and even circular linkage information. This
* causes no harm unless such blocks are referenced
* by the most recent super block.
*/
@@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(fs_info->nodesize)) {
pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
fs_info->nodesize, PAGE_SIZE);
return -1;
}
- if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(fs_info->sectorsize)) {
pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
fs_info->sectorsize, PAGE_SIZE);
return -1;
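
The check-integrity hunks replace open-coded masks with offset_in_page() and PAGE_ALIGNED(). A minimal sketch of what those helpers compute, using simplified macro definitions that only approximate the kernel's (which also accept pointers):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define offset_in_page(p)  ((unsigned long)(p) & (PAGE_SIZE - 1))
#define PAGE_ALIGNED(x)    (offset_in_page(x) == 0)

int main(void)
{
        unsigned long long bytenr = (5ULL << 12) + 123;

        /* Equivalent to the old open-coded form: */
        assert(offset_in_page(bytenr) ==
               (bytenr & ((unsigned long long)PAGE_SIZE - 1)));
        assert(PAGE_ALIGNED(5ULL << 12));
        assert(!PAGE_ALIGNED(bytenr));

        printf("offset in page: %lu\n", offset_in_page(bytenr));
        return 0;
}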
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8703ce68fe9d..4f2a8ae0aa42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -162,13 +162,14 @@ csum_failed:
} else {
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
@@ -229,7 +230,6 @@ static noinline void end_compressed_writeback(struct inode *inode,
*/
static void end_compressed_bio_write(struct bio *bio)
{
- struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
@@ -248,14 +248,10 @@ static void end_compressed_bio_write(struct bio *bio)
* call back into the FS and do all the end_io operations
*/
inode = cb->inode;
- tree = &BTRFS_I(inode)->io_tree;
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
- tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
- cb->start,
- cb->start + cb->len - 1,
- NULL,
- bio->bi_status ?
- BLK_STS_OK : BLK_STS_NOTSUPP);
+ btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+ cb->start, cb->start + cb->len - 1,
+ bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -306,7 +302,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- WARN_ON(start & ((u64)PAGE_SIZE - 1));
+ WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
@@ -337,7 +333,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_iter.bi_size)
- submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0);
+ submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
+ 0);
page->mapping = NULL;
if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
@@ -437,10 +434,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
@@ -483,7 +478,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (page->index == end_index) {
char *userpage;
- size_t zero_offset = isize & (PAGE_SIZE - 1);
+ size_t zero_offset = offset_in_page(isize);
if (zero_offset) {
int zeros;
@@ -617,8 +612,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
- submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE,
- comp_bio, 0);
+ submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
+ comp_bio, 0);
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
@@ -736,6 +731,28 @@ struct heuristic_ws {
struct list_head list;
};
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -748,7 +765,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
@@ -775,65 +792,59 @@ fail:
return ERR_PTR(-ENOMEM);
}
-struct workspaces_list {
- struct list_head idle_ws;
- spinlock_t ws_lock;
- /* Number of free workspaces */
- int free_ws;
- /* Total number of allocated workspaces */
- atomic_t total_ws;
- /* Waiters for a free workspace */
- wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+ .init_workspace_manager = heuristic_init_workspace_manager,
+ .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+ .get_workspace = heuristic_get_workspace,
+ .put_workspace = heuristic_put_workspace,
+ .alloc_workspace = alloc_heuristic_ws,
+ .free_workspace = free_heuristic_ws,
};
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ /* The heuristic is represented as compression type 0 */
+ &btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops)
{
struct list_head *workspace;
- int i;
- INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
- spin_lock_init(&btrfs_heuristic_ws.ws_lock);
- atomic_set(&btrfs_heuristic_ws.total_ws, 0);
- init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+ wsm->ops = ops;
+
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ init_waitqueue_head(&wsm->ws_wait);
- workspace = alloc_heuristic_ws();
+ /*
+ * Preallocate one workspace for each compression type so we can
+ * guarantee forward progress in the worst case
+ */
+ workspace = wsm->ops->alloc_workspace(0);
if (IS_ERR(workspace)) {
pr_warn(
- "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+ "BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
- atomic_set(&btrfs_heuristic_ws.total_ws, 1);
- btrfs_heuristic_ws.free_ws = 1;
- list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+ list_add(workspace, &wsm->idle_ws);
}
+}
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
- spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].total_ws, 0);
- init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+ struct list_head *ws;
- /*
- * Preallocate one workspace for each compression type so
- * we can guarantee forward progress in the worst case
- */
- workspace = btrfs_compress_op[i]->alloc_workspace();
- if (IS_ERR(workspace)) {
- pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
- } else {
- atomic_set(&btrfs_comp_ws[i].total_ws, 1);
- btrfs_comp_ws[i].free_ws = 1;
- list_add(workspace, &btrfs_comp_ws[i].idle_ws);
- }
+ while (!list_empty(&wsman->idle_ws)) {
+ ws = wsman->idle_ws.next;
+ list_del(ws);
+ wsman->ops->free_workspace(ws);
+ atomic_dec(&wsman->total_ws);
}
}
@@ -843,11 +854,11 @@ void __init btrfs_init_compress(void)
 * Preallocation makes a forward progress guarantee and we do not return
* errors.
*/
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level)
{
struct list_head *workspace;
int cpus = num_online_cpus();
- int idx = type - 1;
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
@@ -855,19 +866,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
@@ -898,10 +901,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- if (heuristic)
- workspace = alloc_heuristic_ws();
- else
- workspace = btrfs_compress_op[idx]->alloc_workspace();
+ workspace = wsm->ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -932,85 +932,47 @@ again:
return workspace;
}
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
{
- return __find_workspace(type, false);
+ return btrfs_compress_op[type]->get_workspace(level);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-static void __free_workspace(int type, struct list_head *workspace,
- bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
{
- int idx = type - 1;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
- list_add(workspace, idle_ws);
+ list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
- if (heuristic)
- free_heuristic_ws(workspace);
- else
- btrfs_compress_op[idx]->free_workspace(workspace);
+ wsm->ops->free_workspace(ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
{
- return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
- struct list_head *workspace;
- int i;
-
- while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
- workspace = btrfs_heuristic_ws.idle_ws.next;
- list_del(workspace);
- free_heuristic_ws(workspace);
- atomic_dec(&btrfs_heuristic_ws.total_ws);
- }
-
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
- workspace = btrfs_comp_ws[i].idle_ws.next;
- list_del(workspace);
- btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].total_ws);
- }
- }
+ return btrfs_compress_op[type]->put_workspace(ws);
}
/*
@@ -1042,18 +1004,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out)
{
+ int type = btrfs_compress_type(type_level);
+ int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
- int type = type_level & 0xF;
-
- workspace = find_workspace(type);
- btrfs_compress_op[type - 1]->set_level(workspace, type_level);
- ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ workspace = get_workspace(type, level);
+ ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
start, pages,
out_pages,
total_in, total_out);
- free_workspace(type, workspace);
+ put_workspace(type, workspace);
return ret;
}
@@ -1077,9 +1038,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int ret;
int type = cb->compress_type;
- workspace = find_workspace(type);
- ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
- free_workspace(type, workspace);
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ put_workspace(type, workspace);
return ret;
}
@@ -1095,19 +1056,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
struct list_head *workspace;
int ret;
- workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
+ put_workspace(type, workspace);
- free_workspace(type, workspace);
return ret;
}
+void __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->init_workspace_manager();
+}
+
void __cold btrfs_exit_compress(void)
{
- free_workspaces();
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->cleanup_workspace_manager();
}
/*
@@ -1209,7 +1180,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
/*
* Shannon Entropy calculation
*
- * Pure byte distribution analysis fails to determine compressiability of data.
+ * Pure byte distribution analysis fails to determine compressibility of data.
* Try calculating entropy to estimate the average minimum number of bits
* needed to encode the sampled data.
*
@@ -1273,7 +1244,7 @@ static u8 get4bits(u64 num, int shift) {
/*
* Use 4 bits as radix base
- * Use 16 u32 counters for calculating new possition in buf array
+ * Use 16 u32 counters for calculating new position in buf array
*
* @array - array that will be sorted
* @array_buf - buffer array to store sorting results
@@ -1518,7 +1489,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = __find_workspace(0, true);
+ struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1587,18 +1558,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
}
out:
- __free_workspace(0, ws_list, true);
+ put_workspace(0, ws_list);
return ret;
}
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (e.g. after "zlib", starting with ":")
+ * to a level; an unrecognized string selects the default level
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
- if (strncmp(str, "zlib", 4) != 0)
+ unsigned int level = 0;
+ int ret;
+
+ if (!type)
return 0;
- /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
- if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
- return str[5] - '0';
+ if (str[0] == ':') {
+ ret = kstrtouint(str + 1, 10, &level);
+ if (ret)
+ level = 0;
+ }
+
+ level = btrfs_compress_op[type]->set_level(level);
- return BTRFS_ZLIB_DEFAULT_LEVEL;
+ return level;
}
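
To make the suffix parsing above concrete, here is a userspace sketch of the same logic, with strtoul() standing in for kstrtouint() and the final set_level() clamp omitted; str2level() and its checks are illustrative, not the kernel code.

#include <stdio.h>
#include <stdlib.h>

/* Parse a ":N" suffix (e.g. the ":3" in "zlib:3") into a level,
 * falling back to 0, meaning "use the default", on anything else. */
static unsigned int str2level(unsigned int type, const char *str)
{
        char *end;
        unsigned long level;

        if (!type)                      /* BTRFS_COMPRESS_NONE */
                return 0;
        if (str[0] != ':')
                return 0;
        level = strtoul(str + 1, &end, 10);
        if (end == str + 1 || *end != '\0')
                return 0;               /* not a clean number */
        return (unsigned int)level;
}

int main(void)
{
        printf("%u %u %u\n",
               str2level(1, ":3"),      /* 3 */
               str2level(1, ":x"),      /* 0 -> default */
               str2level(0, ":9"));     /* 0, type none */
        return 0;
}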
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b80bf20..9976fe0f7526 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -64,6 +64,16 @@ struct compressed_bio {
u32 sums;
};
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+ return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+ return ((type_level & 0xF0) >> 4);
+}
+
void __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
@@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned btrfs_compress_str2level(const char *str);
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@@ -97,8 +107,35 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_TYPES = 3,
};
+struct workspace_manager {
+ const struct btrfs_compress_op *ops;
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops);
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level);
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+
struct btrfs_compress_op {
- struct list_head *(*alloc_workspace)(void);
+ void (*init_workspace_manager)(void);
+
+ void (*cleanup_workspace_manager)(void);
+
+ struct list_head *(*get_workspace)(unsigned int level);
+
+ void (*put_workspace)(struct list_head *ws);
+
+ struct list_head *(*alloc_workspace)(unsigned int level);
void (*free_workspace)(struct list_head *workspace);
@@ -119,9 +156,18 @@ struct btrfs_compress_op {
unsigned long start_byte,
size_t srclen, size_t destlen);
- void (*set_level)(struct list_head *ws, unsigned int type);
+ /*
+ * This bounds the level set by the user to be within range of a
+ * particular compression type. It returns the level that will be used
+ * if the level is out of bounds or the default if 0 is passed in.
+ */
+ unsigned int (*set_level)(unsigned int level);
};
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
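
The two accessors added at the top of this header unpack the type_level byte: compression type in the low nibble, level in the next. A small sketch, including an assumed inverse packing helper that is not part of the patch:

#include <assert.h>
#include <stdio.h>

static unsigned int compress_type(unsigned int type_level)
{
        return type_level & 0xF;
}

static unsigned int compress_level(unsigned int type_level)
{
        return (type_level & 0xF0) >> 4;
}

/* Assumed inverse of the two accessors above, for illustration only. */
static unsigned int pack_type_level(unsigned int type, unsigned int level)
{
        return (type & 0xF) | ((level & 0xF) << 4);
}

int main(void)
{
        unsigned int tl = pack_type_level(1 /* zlib */, 9);

        assert(compress_type(tl) == 1);
        assert(compress_level(tl) == 9);
        printf("type_level=0x%02x\n", tl);
        return 0;
}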
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2ee43b6a4f09..324df36d28bf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -12,6 +12,8 @@
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -44,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (!p->nodes[i] || !p->locks[i])
continue;
- btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_READ_LOCK)
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
}
}
@@ -224,7 +233,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, new_root_objectid);
- write_extent_buffer_fsid(cow, fs_info->fsid);
+ write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
@@ -967,6 +976,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
+static struct extent_buffer *alloc_tree_block_no_bg_flush(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent_start,
+ const struct btrfs_disk_key *disk_key,
+ int level,
+ u64 hint,
+ u64 empty_size)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *ret;
+
+ /*
+ * If we are COWing a node/leaf from the extent, chunk, device or free
+ * space trees, make sure that we do not finish block group creation of
+ * pending block groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk, device and free space trees;
+ * therefore we could deadlock with ourselves, since we are holding a
+ * lock on an extent buffer that btrfs_create_pending_block_groups() may
+ * try to COW later.
+ * For similar reasons, we also need to delay flushing pending block
+ * groups when splitting a leaf or node, from one of those trees, since
+ * we are holding a write lock on it and its parent or when inserting a
+ * new root node for one of those trees.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root ||
+ root == fs_info->free_space_root)
+ trans->can_flush_pending_bgs = false;
+
+ ret = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, disk_key, level,
+ hint, empty_size);
+ trans->can_flush_pending_bgs = true;
+
+ return ret;
+}
+
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -1014,9 +1065,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- cow = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, &disk_key, level,
- search_start, empty_size);
+ cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
+ level, search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1033,7 +1083,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, root->root_key.objectid);
- write_extent_buffer_fsid(cow, fs_info->fsid);
+ write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
@@ -1246,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
return eb;
btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
@@ -1273,7 +1323,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
- extent_buffer_get(eb_rewin);
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1337,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
- btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock_blocking(eb_root);
free_extent_buffer(eb_root);
@@ -1345,7 +1394,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- extent_buffer_get(eb);
btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
@@ -1398,7 +1446,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
*
* What is forced COW:
* when we create snapshot during committing the transaction,
- * after we've finished coping src root, we must COW the shared
+ * after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
if (btrfs_header_generation(buf) == trans->transid &&
@@ -1424,6 +1472,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
+ if (test_bit(BTRFS_ROOT_DELETING, &root->state))
+ btrfs_err(fs_info,
+ "COW'ing blocks on a fs root that's being dropped");
+
if (trans->transaction != fs_info->running_transaction)
WARN(1, KERN_CRIT "trans %llu running %llu\n",
trans->transid,
@@ -1442,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
- btrfs_set_lock_blocking(parent);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(parent);
+ btrfs_set_lock_blocking_write(buf);
+ /*
+ * Before CoWing this block for later modification, check if it's
+ * the subtree root and do the delayed subtree trace if needed.
+ *
+ * Also, we don't care about the error, as it's handled internally.
+ */
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
@@ -1538,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking_write(parent);
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
@@ -1597,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking(cur);
+ btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -1812,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking(child);
+ btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
if (ret) {
btrfs_tree_unlock(child);
@@ -1850,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left);
if (wret) {
@@ -1865,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right);
if (wret) {
@@ -2028,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2083,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2485,26 +2544,6 @@ done:
return ret;
}
-static void key_search_validate(struct extent_buffer *b,
- const struct btrfs_key *key,
- int level)
-{
-#ifdef CONFIG_BTRFS_ASSERT
- struct btrfs_disk_key disk_key;
-
- btrfs_cpu_key_to_disk(&disk_key, key);
-
- if (level == 0)
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_leaf, items[0].key),
- sizeof(disk_key)));
- else
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_node, ptrs[0].key),
- sizeof(disk_key)));
-#endif
-}
-
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
@@ -2513,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
return *prev_cmp;
}
- key_search_validate(b, key, level);
*slot = 0;
return 0;
@@ -2567,14 +2605,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
root_lock = BTRFS_READ_LOCK;
if (p->search_commit_root) {
- /* The commit roots are read only so we always do read locks */
- if (p->need_commit_sem)
+ /*
+ * The commit roots are read only, so we always take read locks,
+ * and we must always hold the commit_root_sem when doing
+ * searches on them. The only exception is send, where we don't
+ * want to block transaction commits for a long time, so
+ * we need to clone the commit root in order to avoid races
+ * with transaction commits that create a snapshot of one of
+ * the roots used by a send operation.
+ */
+ if (p->need_commit_sem) {
down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
+ b = btrfs_clone_extent_buffer(root->commit_root);
up_read(&fs_info->commit_root_sem);
+ if (!b)
+ return ERR_PTR(-ENOMEM);
+
+ } else {
+ b = root->commit_root;
+ extent_buffer_get(b);
+ }
+ level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
* p->search_commit_root = 1.
@@ -2700,6 +2751,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto done;
+ }
while (b) {
level = btrfs_header_level(b);
@@ -2944,6 +2999,8 @@ again:
*/
prev_cmp = -1;
ret = key_search(b, key, level, &prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
if (level != 0) {
int dec = 0;
@@ -3306,8 +3363,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &lower_key, level, root->node->start, 0);
+ c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
+ root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3436,8 +3493,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
+ c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3710,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
@@ -3734,7 +3791,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
* it, therefore use right neighbor to insert the new item and
- * no need to touch/dirty our left leaft. */
+ * no need to touch/dirty our left leaf. */
btrfs_tree_unlock(left);
free_extent_buffer(left);
path->nodes[0] = right;
@@ -3944,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
@@ -4221,8 +4278,8 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
+ l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -5095,6 +5152,10 @@ again:
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
sret = btrfs_bin_search(cur, min_key, level, &slot);
+ if (sret < 0) {
+ ret = sret;
+ goto out;
+ }
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {
@@ -5373,7 +5434,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
- extent_buffer_get(left_path->nodes[left_level]);
right_level = btrfs_header_level(right_root->commit_root);
right_root_level = right_level;
@@ -5384,7 +5444,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
- extent_buffer_get(right_path->nodes[right_level]);
up_read(&fs_info->commit_root_sem);
if (left_level == 0)
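
btrfs_search_slot_get_root() above now clones the commit root inside a short commit_root_sem critical section instead of holding a reference for the duration of a long search. A loose userspace model of that clone-then-release pattern (the buffer type, lock and function names are all stand-ins):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

struct buffer { char data[64]; };

static struct buffer commit_root = { "shared commit root" };
static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

static struct buffer *clone_commit_root(void)
{
        struct buffer *b = malloc(sizeof(*b));

        if (!b)
                return NULL;
        /* Hold the semaphore only long enough to copy the root. */
        pthread_rwlock_rdlock(&commit_root_sem);
        memcpy(b, &commit_root, sizeof(*b));
        pthread_rwlock_unlock(&commit_root_sem);
        return b;
}

int main(void)
{
        struct buffer *b = clone_commit_root();

        if (!b)
                return 1;       /* mirrors returning ERR_PTR(-ENOMEM) */
        printf("searching private copy: %s\n", b->data);
        free(b);
        return 0;
}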
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68ca41dbbef3..129d26226e70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
+struct btrfs_delayed_ref_root;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
@@ -109,13 +110,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * File system states
+ * Runtime (in-memory) states of filesystem
*/
-#define BTRFS_FS_STATE_ERROR 0
-#define BTRFS_FS_STATE_REMOUNTING 1
-#define BTRFS_FS_STATE_TRANS_ABORTED 2
-#define BTRFS_FS_STATE_DEV_REPLACING 3
-#define BTRFS_FS_STATE_DUMMY_FS_INFO 4
+enum {
+ /* Global indicator of serious filesystem errors */
+ BTRFS_FS_STATE_ERROR,
+ /*
+ * Filesystem is being remounted; some operations, like defrag, may
+ * be skipped
+ */
+ BTRFS_FS_STATE_REMOUNTING,
+ /* Track if a transaction abort has been reported on this filesystem */
+ BTRFS_FS_STATE_TRANS_ABORTED,
+ /*
+ * Bio operations should be blocked on this filesystem because a source
+ * or target device is being destroyed as part of a device replace
+ */
+ BTRFS_FS_STATE_DEV_REPLACING,
+ /* The btrfs_fs_info created for self-tests */
+ BTRFS_FS_STATE_DUMMY_FS_INFO,
+};
#define BTRFS_BACKREF_REV_MAX 256
#define BTRFS_BACKREF_REV_SHIFT 56
@@ -195,9 +209,10 @@ struct btrfs_root_backup {
* it currently lacks any block count etc etc
*/
struct btrfs_super_block {
- u8 csum[BTRFS_CSUM_SIZE];
/* the first 4 fields must match struct btrfs_header */
- u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ u8 csum[BTRFS_CSUM_SIZE];
+ /* FS specific UUID, visible to user */
+ u8 fsid[BTRFS_FSID_SIZE];
__le64 bytenr; /* this block number */
__le64 flags;
@@ -234,8 +249,11 @@ struct btrfs_super_block {
__le64 cache_generation;
__le64 uuid_tree_generation;
+ /* the UUID written into btree blocks */
+ u8 metadata_uuid[BTRFS_FSID_SIZE];
+
/* future expansion */
- __le64 reserved[30];
+ __le64 reserved[28];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
@@ -265,7 +283,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -316,7 +335,7 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
-enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
+enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
@@ -360,9 +379,7 @@ struct btrfs_dev_replace {
struct btrfs_device *tgtdev;
struct mutex lock_finishing_cancel_unmount;
- rwlock_t lock;
- atomic_t blocking_readers;
- wait_queue_head_t read_lock_wq;
+ struct rw_semaphore rwsem;
struct btrfs_scrub_progress scrub_progress;
@@ -443,13 +460,19 @@ struct btrfs_space_info {
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
-#define BTRFS_BLOCK_RSV_GLOBAL 1
-#define BTRFS_BLOCK_RSV_DELALLOC 2
-#define BTRFS_BLOCK_RSV_TRANS 3
-#define BTRFS_BLOCK_RSV_CHUNK 4
-#define BTRFS_BLOCK_RSV_DELOPS 5
-#define BTRFS_BLOCK_RSV_EMPTY 6
-#define BTRFS_BLOCK_RSV_TEMP 7
+/*
+ * Types of block reserves
+ */
+enum {
+ BTRFS_BLOCK_RSV_GLOBAL,
+ BTRFS_BLOCK_RSV_DELALLOC,
+ BTRFS_BLOCK_RSV_TRANS,
+ BTRFS_BLOCK_RSV_CHUNK,
+ BTRFS_BLOCK_RSV_DELOPS,
+ BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_EMPTY,
+ BTRFS_BLOCK_RSV_TEMP,
+};
struct btrfs_block_rsv {
u64 size;
@@ -509,18 +532,18 @@ struct btrfs_free_cluster {
};
enum btrfs_caching_type {
- BTRFS_CACHE_NO = 0,
- BTRFS_CACHE_STARTED = 1,
- BTRFS_CACHE_FAST = 2,
- BTRFS_CACHE_FINISHED = 3,
- BTRFS_CACHE_ERROR = 4,
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FAST,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
};
enum btrfs_disk_cache_state {
- BTRFS_DC_WRITTEN = 0,
- BTRFS_DC_ERROR = 1,
- BTRFS_DC_CLEAR = 2,
- BTRFS_DC_SETUP = 3,
+ BTRFS_DC_WRITTEN,
+ BTRFS_DC_ERROR,
+ BTRFS_DC_CLEAR,
+ BTRFS_DC_SETUP,
};
struct btrfs_caching_control {
@@ -712,41 +735,64 @@ struct btrfs_fs_devices;
struct btrfs_balance_control;
struct btrfs_delayed_root;
-#define BTRFS_FS_BARRIER 1
-#define BTRFS_FS_CLOSING_START 2
-#define BTRFS_FS_CLOSING_DONE 3
-#define BTRFS_FS_LOG_RECOVERING 4
-#define BTRFS_FS_OPEN 5
-#define BTRFS_FS_QUOTA_ENABLED 6
-#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9
-#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10
-#define BTRFS_FS_BTREE_ERR 11
-#define BTRFS_FS_LOG1_ERR 12
-#define BTRFS_FS_LOG2_ERR 13
-#define BTRFS_FS_QUOTA_OVERRIDE 14
-/* Used to record internally whether fs has been frozen */
-#define BTRFS_FS_FROZEN 15
-
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
*/
-#define BTRFS_FS_EXCL_OP 16
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
+ * ptr points to a struct btrfs_device.
+ */
+ bool is_block_group;
+};
-/*
- * To info transaction_kthread we need an immediate commit so it doesn't
- * need to wait for commit_interval
- */
-#define BTRFS_FS_NEED_ASYNC_COMMIT 17
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
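A standalone sketch (not part of the patch) of the (ptr, inode) ordering the
comment above describes; the struct and function names are illustrative only:

#include <stdint.h>

/* Order by raw pointer value first, then by inode, so one block group
 * or device can pin several swapfiles under distinct keys. */
struct pin_key {
	void *ptr;	/* block group or device, compared by address */
	void *inode;	/* the swapfile's inode */
};

static int pin_key_cmp(const struct pin_key *a, const struct pin_key *b)
{
	if ((uintptr_t)a->ptr != (uintptr_t)b->ptr)
		return (uintptr_t)a->ptr < (uintptr_t)b->ptr ? -1 : 1;
	if ((uintptr_t)a->inode != (uintptr_t)b->inode)
		return (uintptr_t)a->inode < (uintptr_t)b->inode ? -1 : 1;
	return 0;
}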
+
+enum {
+ BTRFS_FS_BARRIER,
+ BTRFS_FS_CLOSING_START,
+ BTRFS_FS_CLOSING_DONE,
+ BTRFS_FS_LOG_RECOVERING,
+ BTRFS_FS_OPEN,
+ BTRFS_FS_QUOTA_ENABLED,
+ BTRFS_FS_UPDATE_UUID_TREE_GEN,
+ BTRFS_FS_CREATING_FREE_SPACE_TREE,
+ BTRFS_FS_BTREE_ERR,
+ BTRFS_FS_LOG1_ERR,
+ BTRFS_FS_LOG2_ERR,
+ BTRFS_FS_QUOTA_OVERRIDE,
+ /* Used to record internally whether fs has been frozen */
+ BTRFS_FS_FROZEN,
+ /*
+ * Indicate that a whole-filesystem exclusive operation is running
+ * (device replace, resize, device add/delete, balance)
+ */
+ BTRFS_FS_EXCL_OP,
+ /*
+ * To inform transaction_kthread we need an immediate commit so it
+ * doesn't need to wait for commit_interval
+ */
+ BTRFS_FS_NEED_ASYNC_COMMIT,
+ /*
+ * Indicate that balance has been set up from the ioctl and is in the
+ * main phase. The fs_info::balance_ctl is initialized.
+ */
+ BTRFS_FS_BALANCE_RUNNING,
-/*
- * Indicate that balance has been set up from the ioctl and is in the main
- * phase. The fs_info::balance_ctl is initialized.
- */
-#define BTRFS_FS_BALANCE_RUNNING 18
+ /* Indicate that the cleaner thread is awake and doing something. */
+ BTRFS_FS_CLEANER_RUNNING,
+};
struct btrfs_fs_info {
- u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
struct btrfs_root *extent_root;
@@ -790,6 +836,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv chunk_block_rsv;
/* block reservation for delayed operations */
struct btrfs_block_rsv delayed_block_rsv;
+ /* block reservation for delayed refs */
+ struct btrfs_block_rsv delayed_refs_rsv;
struct btrfs_block_rsv empty_block_rsv;
@@ -886,7 +934,8 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct mutex cleaner_delayed_iput_mutex;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1026,10 +1075,13 @@ struct btrfs_fs_info {
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
- int scrub_workers_refcnt;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, i.e. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_nocow_workers;
struct btrfs_workqueue *scrub_parity_workers;
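The invariant stated above (workers exist iff the refcount is non-zero) can be
pictured with this hedged sketch; create_workers()/destroy_workers() are
hypothetical stand-ins, and the caller is assumed to hold the lock that
serializes scrub setup and teardown:

#include <linux/refcount.h>

extern void create_workers(void);	/* hypothetical */
extern void destroy_workers(void);	/* hypothetical */

static refcount_t workers_refcnt;	/* 0 <=> no workers allocated */

static void get_workers(void)		/* caller holds the setup lock */
{
	if (refcount_read(&workers_refcnt) == 0) {
		create_workers();
		refcount_set(&workers_refcnt, 1);
	} else {
		refcount_inc(&workers_refcnt);
	}
}

static void put_workers(void)
{
	if (refcount_dec_and_test(&workers_refcnt))
		destroy_workers();
}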
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1100,9 +1152,6 @@ struct btrfs_fs_info {
struct mutex unused_bg_unpin_mutex;
struct mutex delete_unused_bgs_mutex;
- /* For btrfs to record security options */
- struct security_mnt_opts security_opts;
-
/*
* Chunks that can't be freed yet (under a trim/discard operation)
* and will be latter freed. Protected by fs_info->chunk_mutex.
@@ -1114,6 +1163,10 @@ struct btrfs_fs_info {
u32 sectorsize;
u32 stripesize;
+ /* Block groups and devices containing active swapfiles. */
+ spinlock_t swapfile_pins_lock;
+ struct rb_root swapfile_pins;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1133,22 +1186,42 @@ struct btrfs_subvolume_writers {
/*
* The state of btrfs root
*/
+enum {
+ /*
+ * btrfs_record_root_in_trans is a multi-step process, and it can race
+ * with the balancing code. But the race window is very small, and it
+ * only happens the first time the root is added to each transaction.
+ * So IN_TRANS_SETUP is used to tell us when more checks are required.
+ */
+ BTRFS_ROOT_IN_TRANS_SETUP,
+ BTRFS_ROOT_REF_COWS,
+ BTRFS_ROOT_TRACK_DIRTY,
+ BTRFS_ROOT_IN_RADIX,
+ BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ BTRFS_ROOT_DEFRAG_RUNNING,
+ BTRFS_ROOT_FORCE_COW,
+ BTRFS_ROOT_MULTI_LOG_TASKS,
+ BTRFS_ROOT_DIRTY,
+ BTRFS_ROOT_DELETING,
+
+ /*
+ * Reloc tree is an orphan, only kept here for qgroup delayed subtree scan
+ *
+ * Set for the subvolume tree owning the reloc tree.
+ */
+ BTRFS_ROOT_DEAD_RELOC_TREE,
+};
+
/*
- * btrfs_record_root_in_trans is a multi-step process,
- * and it can race with the balancing code. But the
- * race is very small, and only the first time the root
- * is added to each transaction. So IN_TRANS_SETUP
- * is used to tell us when more checks are required
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For details, see the comment in fs/btrfs/qgroup.c.
*/
-#define BTRFS_ROOT_IN_TRANS_SETUP 0
-#define BTRFS_ROOT_REF_COWS 1
-#define BTRFS_ROOT_TRACK_DIRTY 2
-#define BTRFS_ROOT_IN_RADIX 3
-#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4
-#define BTRFS_ROOT_DEFRAG_RUNNING 5
-#define BTRFS_ROOT_FORCE_COW 6
-#define BTRFS_ROOT_MULTI_LOG_TASKS 7
-#define BTRFS_ROOT_DIRTY 8
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RB_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
+};
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1261,6 +1334,14 @@ struct btrfs_root {
u64 nr_ordered_extents;
/*
+ * Not empty if this subvolume root has gone through tree block swap
+ * (relocation)
+ *
+ * Will be used by reloc_control::dirty_subvol_roots.
+ */
+ struct list_head reloc_dirty_list;
+
+ /*
* Number of currently running SEND ioctls to prevent
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
@@ -1274,6 +1355,12 @@ struct btrfs_root {
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ /* Number of active swapfiles */
+ atomic_t nr_swapfiles;
+
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -2570,10 +2657,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
/* extent-tree.c */
enum btrfs_inline_ref_type {
- BTRFS_REF_TYPE_INVALID = 0,
- BTRFS_REF_TYPE_BLOCK = 1,
- BTRFS_REF_TYPE_DATA = 2,
- BTRFS_REF_TYPE_ANY = 3,
+ BTRFS_REF_TYPE_INVALID,
+ BTRFS_REF_TYPE_BLOCK,
+ BTRFS_REF_TYPE_DATA,
+ BTRFS_REF_TYPE_ANY,
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -2599,7 +2686,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2611,6 +2698,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -2713,10 +2803,13 @@ enum btrfs_reserve_flush_enum {
enum btrfs_flush_state {
FLUSH_DELAYED_ITEMS_NR = 1,
FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELALLOC = 3,
- FLUSH_DELALLOC_WAIT = 4,
- ALLOC_CHUNK = 5,
- COMMIT_TRANS = 6,
+ FLUSH_DELAYED_REFS_NR = 3,
+ FLUSH_DELAYED_REFS = 4,
+ FLUSH_DELALLOC = 5,
+ FLUSH_DELALLOC_WAIT = 6,
+ ALLOC_CHUNK = 7,
+ ALLOC_CHUNK_FORCE = 8,
+ COMMIT_TRANS = 9,
};
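The enum is ordered from cheapest to most expensive step, and the space
flushing code walks it in order until enough space is freed. A minimal
standalone sketch of that walk, with do_flush()/have_enough_space() as
hypothetical helpers:

#include <errno.h>

extern void do_flush(int state);	/* hypothetical */
extern int have_enough_space(void);	/* hypothetical */

static int flush_until_enough(int first_state, int last_state)
{
	int state;

	for (state = first_state; state <= last_state; state++) {
		do_flush(state);
		if (have_enough_space())
			return 0;
	}
	return -ENOSPC;
}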
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -2767,6 +2860,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *src,
+ u64 num_bytes);
int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
@@ -2959,7 +3059,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
- security_free_mnt_opts(&fs_info->security_opts);
kvfree(fs_info);
}
@@ -3116,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start,
- u64 len, int create);
+ u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes);
@@ -3141,7 +3239,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
@@ -3150,9 +3248,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root,
u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio,
- unsigned long bio_flags);
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+ unsigned *bits);
+void btrfs_clear_delalloc_extent(struct inode *inode,
+ struct extent_state *state, unsigned *bits);
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other);
+void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split);
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
+ unsigned long bio_flags);
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3163,6 +3268,9 @@ void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new,
+ struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
@@ -3179,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
@@ -3186,6 +3295,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started, unsigned long *nr_written,
+ struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+ u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -3201,9 +3316,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
- struct file *dst_file, loff_t dst_loff,
- u64 olen);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3233,8 +3345,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len);
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3336,31 +3449,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \
- ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \
- ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
@@ -3411,20 +3510,27 @@ do { \
rcu_read_unlock(); \
} while (0)
-#ifdef CONFIG_BTRFS_ASSERT
-
__cold
static inline void assfail(const char *expr, const char *file, int line)
{
- pr_err("assertion failed: %s, file: %s, line: %d\n",
- expr, file, line);
- BUG();
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ pr_err("assertion failed: %s, file: %s, line: %d\n",
+ expr, file, line);
+ BUG();
+ }
}
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+/*
+ * Use this for functions that are conditionally exported for sanity tests
+ * but are otherwise static
+ */
+#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define EXPORT_FOR_TESTS static
#else
-#define ASSERT(expr) ((void)0)
+#define EXPORT_FOR_TESTS
#endif
__cold
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 5149165b49a4..7d2a413df90d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -164,14 +164,27 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
return NULL;
}
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+ struct btrfs_delayed_ref_root *dr)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = rb_first_cached(&dr->href_root);
+ if (!n)
+ return NULL;
+
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ return entry;
+}
+
/*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot.
- * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found. But if no bigger one is found then the first node of the
- * ref head tree will be returned.
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot. If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
*/
-static struct btrfs_delayed_ref_head* find_ref_head(
+static struct btrfs_delayed_ref_head *find_ref_head(
struct btrfs_delayed_ref_root *dr, u64 bytenr,
bool return_bigger)
{
@@ -195,10 +208,9 @@ static struct btrfs_delayed_ref_head* find_ref_head(
if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
- n = rb_first_cached(&dr->href_root);
+ return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- return entry;
}
return entry;
}
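The lookup semantics described in the rewritten comment (exact match, or the
next bigger entry when return_bigger is set, otherwise NULL) reduce to this
standalone sketch over a sorted array; all names are illustrative:

#include <stddef.h>

static const unsigned long long *find_entry(const unsigned long long *keys,
					    size_t n,
					    unsigned long long bytenr,
					    int return_bigger)
{
	size_t i;

	for (i = 0; i < n; i++) {	/* keys[] sorted ascending */
		if (keys[i] == bytenr)
			return &keys[i];
		if (keys[i] > bytenr)
			return return_bigger ? &keys[i] : NULL;
	}
	return NULL;
}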
@@ -239,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
}
static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -355,33 +365,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs)
{
struct btrfs_delayed_ref_head *head;
- u64 start;
- bool loop = false;
again:
- start = delayed_refs->run_delayed_start;
- head = find_ref_head(delayed_refs, start, true);
- if (!head && !loop) {
+ head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+ true);
+ if (!head && delayed_refs->run_delayed_start != 0) {
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
- head = find_ref_head(delayed_refs, start, true);
- if (!head)
- return NULL;
- } else if (!head && loop) {
- return NULL;
+ head = find_first_ref_head(delayed_refs);
}
+ if (!head)
+ return NULL;
while (head->processing) {
struct rb_node *node;
node = rb_next(&head->href_node);
if (!node) {
- if (loop)
+ if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -396,6 +398,20 @@ again:
return head;
}
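Stripped to its core, the rewritten btrfs_select_ref_head() scans from
run_delayed_start and, if nothing usable is found with a non-zero start,
retries exactly once from 0 so earlier entries are not starved. A standalone
sketch under those assumptions:

#include <stddef.h>

static int select_first_idle(const int *busy, size_t n, size_t start)
{
	size_t i;

	for (i = start; i < n; i++)
		if (!busy[i])
			return (int)i;
	if (start != 0)			/* wrap around once, then give up */
		for (i = 0; i < start; i++)
			if (!busy[i])
				return (int)i;
	return -1;
}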
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ lockdep_assert_held(&delayed_refs->lock);
+ lockdep_assert_held(&head->lock);
+
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
+ RB_CLEAR_NODE(&head->href_node);
+ atomic_dec(&delayed_refs->num_entries);
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+}
+
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
@@ -449,7 +465,6 @@ inserted:
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
- trans->delayed_ref_updates++;
spin_unlock(&href->lock);
return ret;
}
@@ -458,12 +473,14 @@ inserted:
* helper function to update the accounting in the head ref
* existing and update must have the same bytenr
*/
-static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
struct btrfs_delayed_ref_head *update,
int *old_ref_mod_ret)
{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -521,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
* versa we need to make sure to adjust pending_csums accordingly.
*/
if (existing->is_data) {
- if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+ u64 csum_leaves =
+ btrfs_csum_bytes_to_leaves(fs_info,
+ existing->num_bytes);
+
+ if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
delayed_refs->pending_csums -= existing->num_bytes;
- if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+ btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ }
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
delayed_refs->pending_csums += existing->num_bytes;
+ trans->delayed_ref_updates += csum_leaves;
+ }
}
spin_unlock(&existing->lock);
}
@@ -577,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
- head_ref->qgroup_reserved = 0;
- head_ref->qgroup_ref_root = 0;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
if (qrecord) {
if (ref_root && reserved) {
- head_ref->qgroup_ref_root = ref_root;
- head_ref->qgroup_reserved = reserved;
+ qrecord->data_rsv = reserved;
+ qrecord->data_rsv_refroot = ref_root;
}
-
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -626,11 +648,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- WARN_ON(qrecord && head_ref->qgroup_ref_root
- && head_ref->qgroup_reserved
- && existing->qgroup_ref_root
- && existing->qgroup_reserved);
- update_existing_head_ref(delayed_refs, existing, head_ref,
+ update_existing_head_ref(trans, existing, head_ref,
old_ref_mod);
/*
* we've updated the existing ref, free the newly
@@ -641,8 +659,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
} else {
if (old_ref_mod)
*old_ref_mod = 0;
- if (head_ref->is_data && head_ref->ref_mod < 0)
+ if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
+ trans->delayed_ref_updates +=
+ btrfs_csum_bytes_to_leaves(trans->fs_info,
+ head_ref->num_bytes);
+ }
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -741,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -778,6 +800,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+
trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
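btrfs_update_delayed_refs_rsv() folds the updates tracked in
trans->delayed_ref_updates into the new delayed refs reserve. A hypothetical
worst-case sizing, assuming each tracked update may COW one full path of tree
nodes (the real helper in extent-tree.c owns the exact formula):

static unsigned long long updates_to_bytes(unsigned long updates,
					   unsigned int nodesize,
					   unsigned int max_level)
{
	/* one worst-case tree walk per update: nodesize * max_level */
	return (unsigned long long)updates * nodesize * max_level;
}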
@@ -832,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep,
@@ -859,6 +887,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+
trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
@@ -895,6 +929,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
NULL, NULL, NULL);
spin_unlock(&delayed_refs->lock);
+
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 8e20c5cb5404..70606da440aa 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
- * For qgroup reserved space freeing.
- *
- * ref_root and reserved will be recorded after
- * BTRFS_ADD_DELAYED_EXTENT is called.
- * And will be used to free reserved qgroup space at
- * run_delayed_refs() time.
- */
- u64 qgroup_ref_root;
- u64 qgroup_reserved;
-
- /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -261,7 +250,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
mutex_unlock(&head->mutex);
}
-
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2aa48aecc52b..ee193c5222b2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found:
BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
- dev_replace->replace_state = 0;
dev_replace->time_started = 0;
dev_replace->time_stopped = 0;
atomic64_set(&dev_replace->num_write_errors, 0);
@@ -112,11 +111,11 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
- NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info,
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
+ src_devid, NULL, NULL, true);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ NULL, NULL, true);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) {
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
return 0;
}
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
@@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
btrfs_mark_buffer_dirty(eb);
@@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
{
@@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (IS_ERR(src_device))
return PTR_ERR(src_device);
+ if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
+ btrfs_warn_in_rcu(fs_info,
+ "cannot replace device %s (devid %llu) due to active swapfile",
+ btrfs_dev_name(src_device), src_device->devid);
+ return -ETXTBSY;
+ }
+
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
@@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
}
need_unlock = true;
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
@@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
need_unlock = true;
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->srcdev = NULL;
@@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- } else {
+ } else if (ret != -ECANCELED) {
WARN_ON(ret);
}
@@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
leave:
if (need_unlock)
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
- if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
- ret = 0;
+ if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+ ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+ return 0;
return ret;
}
@@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
/*
* flush all outstanding I/O and inode extent mappings before the
@@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- btrfs_err_in_rcu(fs_info,
+ if (scrub_ret != -ECANCELED)
+ btrfs_err_in_rcu(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- btrfs_dev_replace_write_unlock(dev_replace);
-
+ up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_remove_srcdev(src_device);
@@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
args->status.num_uncorrectable_read_errors =
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
}
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
@@ -790,46 +797,75 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
- btrfs_dev_replace_write_unlock(dev_replace);
- goto leave;
+ up_write(&dev_replace->rwsem);
+ break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ up_write(&dev_replace->rwsem);
+ ret = btrfs_scrub_cancel(fs_info);
+ if (ret < 0) {
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ } else {
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ /*
+ * btrfs_dev_replace_finishing() will handle the
+ * cleanup part
+ */
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+ }
+ break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * The scrub doing the replace isn't running, so we need to do the
+ * cleanup step of btrfs_dev_replace_finishing() here
+ */
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
- break;
- }
- dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
- dev_replace->time_stopped = ktime_get_real_seconds();
- dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_write_unlock(dev_replace);
- btrfs_scrub_cancel(fs_info);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = ktime_get_real_seconds();
+ dev_replace->item_needs_writeback = 1;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
- }
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
+ up_write(&dev_replace->rwsem);
- btrfs_info_in_rcu(fs_info,
- "dev_replace from %s (devid %llu) to %s canceled",
- btrfs_dev_name(src_device), src_device->devid,
- btrfs_dev_name(tgt_device));
+ /* Scrub for replace must not be running in suspended state */
+ ret = btrfs_scrub_cancel(fs_info);
+ ASSERT(ret != -ENOTCONN);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
- if (tgt_device)
- btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ btrfs_info_in_rcu(fs_info,
+ "suspended dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ break;
+ default:
+ up_write(&dev_replace->rwsem);
+ result = -EINVAL;
+ }
-leave:
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return result;
}
@@ -839,7 +875,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
+
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -855,7 +892,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
@@ -865,12 +902,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
+
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
@@ -884,10 +922,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
- btrfs_dev_replace_write_unlock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ up_write(&dev_replace->rwsem);
return 0;
}
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
/*
* This could collide with a paused balance, but the exclusive op logic
@@ -895,6 +935,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* dev-replace to start anyway.
*/
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ down_write(&dev_replace->rwsem);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ up_write(&dev_replace->rwsem);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
@@ -925,7 +969,7 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
- WARN_ON(ret);
+ WARN_ON(ret && ret != -ECANCELED);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
@@ -948,7 +992,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
* something that can happen if the dev_replace
* procedure is suspended by an umount and then
* the tgtdev is missing (or "btrfs dev scan") was
- * not called and the the filesystem is remounted
+ * not called and the filesystem is remounted
* in degraded state. This does not stop the
* dev_replace procedure. It needs to be canceled
* manually if the cancellation is wanted.
@@ -958,42 +1002,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
-{
- read_lock(&dev_replace->lock);
-}
-
-void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
-{
- read_unlock(&dev_replace->lock);
-}
-
-void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
-{
-again:
- wait_event(dev_replace->read_lock_wq,
- atomic_read(&dev_replace->blocking_readers) == 0);
- write_lock(&dev_replace->lock);
- if (atomic_read(&dev_replace->blocking_readers)) {
- write_unlock(&dev_replace->lock);
- goto again;
- }
-}
-
-void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
-{
- write_unlock(&dev_replace->lock);
-}
-
-/* inc blocking cnt and release read lock */
-void btrfs_dev_replace_set_lock_blocking(
- struct btrfs_dev_replace *dev_replace)
-{
- /* only set blocking for read lock */
- atomic_inc(&dev_replace->blocking_readers);
- read_unlock(&dev_replace->lock);
-}
-
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 795c551f5b5e..4aa40bacc6cc 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
- const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
- int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..f0cdb53f3e2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@@ -279,6 +280,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
len = buf->len - offset;
while (len > 0) {
+ /*
+ * Note: we don't need to check for the err == 1 case here, as
+ * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
+ * and 'min_len = 32' and the currently implemented mapping
+ * algorithm we cannot cross a page boundary.
+ */
err = map_private_extent_buffer(buf, offset, 32,
&kaddr, &map_start, &map_len);
if (err)
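The page-boundary claim in the new note can be checked mechanically: with a
32-byte start and 32-byte steps inside 4096-byte pages, no chunk straddles a
page, because 4096 - 32 is itself a multiple of 32. A small standalone check:

#include <assert.h>

int main(void)
{
	unsigned int offset;

	/* every 32-byte chunk starting at a multiple of 32 stays inside
	 * one 4096-byte page */
	for (offset = 32; offset < 16 * 4096; offset += 32)
		assert(offset % 4096 + 32 <= 4096);
	return 0;
}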
@@ -335,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (need_lock) {
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -477,9 +484,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
int mirror_num = 0;
int failed_mirror = 0;
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
mirror_num);
if (!ret) {
@@ -493,15 +500,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
break;
}
- /*
- * This buffer's crc is fine, but its contents are corrupted, so
- * there is no reason to read the other copies, they won't be
- * any less wrong.
- */
- if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
- ret == -EUCLEAN)
- break;
-
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
if (num_copies == 1)
@@ -551,7 +549,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
- ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
return csum_tree_block(fs_info, eb, 0);
@@ -566,7 +564,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
while (fs_devices) {
- if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+ u8 *metadata_uuid;
+
+ /*
+ * Checking the incompat flag is only valid for the current
+ * fs. For seed devices it's forbidden to have their uuid
+ * changed so reading ->fsid in this case is fine
+ */
+ if (fs_devices == fs_info->fs_devices &&
+ btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
+
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
ret = 0;
break;
}
@@ -669,19 +680,6 @@ out:
return ret;
}
-static int btree_io_failed_hook(struct page *page, int failed_mirror)
-{
- struct extent_buffer *eb;
-
- eb = (struct extent_buffer *)page->private;
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = failed_mirror;
- atomic_dec(&eb->io_pages);
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, -EIO);
- return -EIO; /* we fixed nothing */
-}
-
static void end_workqueue_bio(struct bio *bio)
{
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
@@ -760,11 +758,22 @@ static void run_one_async_start(struct btrfs_work *work)
async->status = ret;
}
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the tree.
+ */
static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async;
+ struct inode *inode;
+ blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
+ inode = async->private_data;
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
@@ -773,7 +782,12 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num);
+ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
+ async->mirror_num, 1);
+ if (ret) {
+ async->bio->bi_status = ret;
+ bio_endio(async->bio);
+ }
}
static void run_one_async_free(struct btrfs_work *work)
@@ -819,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
@@ -1107,7 +1122,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
-buf->len,
fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1162,6 +1177,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
@@ -1187,6 +1203,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
+ atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1204,6 +1221,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1244,10 +1262,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
+ unsigned int nofs_flag;
int ret = 0;
uuid_le uuid = NULL_UUID_LE;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
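The scoped-NOFS pattern used above, shown in isolation:
memalloc_nofs_save()/memalloc_nofs_restore() are the real
<linux/sched/mm.h> API, while the wrapper itself is illustrative:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Within the save/restore window, GFP_KERNEL allocations behave as
 * GFP_NOFS, so direct reclaim cannot recurse into the filesystem. */
static void *alloc_under_transaction(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	return p;
}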
@@ -1664,11 +1689,12 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
struct btrfs_fs_info *fs_info = root->fs_info;
int again;
- struct btrfs_trans_handle *trans;
- do {
+ while (1) {
again = 0;
+ set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+
/* Make the cleaner go to sleep early. */
if (btrfs_need_cleaner_sleep(fs_info))
goto sleep;
@@ -1692,9 +1718,7 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -1715,42 +1739,17 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(fs_info);
sleep:
+ clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+ if (kthread_should_park())
+ kthread_parkme();
+ if (kthread_should_stop())
+ return 0;
if (!again) {
set_current_state(TASK_INTERRUPTIBLE);
- if (!kthread_should_stop())
- schedule();
+ schedule();
__set_current_state(TASK_RUNNING);
}
- } while (!kthread_should_stop());
-
- /*
- * Transaction kthread is stopped before us and wakes us up.
- * However we might have started a new transaction and COWed some
- * tree blocks when deleting unused block groups for example. So
- * make sure we commit the transaction we started to have a clean
- * shutdown when evicting the btree inode - if it has dirty pages
- * when we do the final iput() on it, eviction will trigger a
- * writeback for it which will fail with null pointer dereferences
- * since work queues and other resources were already released and
- * destroyed by the time the iput/eviction/writeback is made.
- */
- trans = btrfs_attach_transaction(root);
- if (IS_ERR(trans)) {
- if (PTR_ERR(trans) != -ENOENT)
- btrfs_err(fs_info,
- "cleaner transaction attach returned %ld",
- PTR_ERR(trans));
- } else {
- int ret;
-
- ret = btrfs_commit_transaction(trans);
- if (ret)
- btrfs_err(fs_info,
- "cleaner open transaction commit returned %d",
- ret);
}
-
- return 0;
}
static int transaction_kthread(void *arg)
@@ -2111,7 +2110,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2154,10 +2153,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
- rwlock_init(&fs_info->dev_replace.lock);
- atomic_set(&fs_info->dev_replace.blocking_readers, 0);
+ init_rwsem(&fs_info->dev_replace.rwsem);
init_waitqueue_head(&fs_info->dev_replace.replace_wait);
- init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2478,10 +2475,11 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+ if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
+ BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
- "dev_item UUID does not match fsid: %pU != %pU",
- fs_info->fsid, sb->dev_item.fsid);
+ "dev_item UUID does not match metadata fsid: %pU != %pU",
+ fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
ret = -EINVAL;
}
@@ -2677,7 +2675,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
- mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2692,10 +2689,14 @@ int open_ctree(struct super_block *sb,
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
+ btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
+ BTRFS_BLOCK_RSV_DELREFS);
+
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2773,6 +2774,7 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
INIT_LIST_HEAD(&fs_info->pinned_chunks);
@@ -2781,6 +2783,9 @@ int open_ctree(struct super_block *sb,
fs_info->sectorsize = 4096;
fs_info->stripesize = 4096;
+ spin_lock_init(&fs_info->swapfile_pins_lock);
+ fs_info->swapfile_pins = RB_ROOT;
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@@ -2817,11 +2822,29 @@ int open_ctree(struct super_block *sb,
* the whole block of INFO_SIZE
*/
memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
- memcpy(fs_info->super_for_commit, fs_info->super_copy,
- sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+ disk_super = fs_info->super_copy;
+
+ ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+ BTRFS_FSID_SIZE));
+
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
+ ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
+ fs_info->super_copy->metadata_uuid,
+ BTRFS_FSID_SIZE));
+ }
+
+ features = btrfs_super_flags(disk_super);
+ if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
+ btrfs_set_super_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "found metadata UUID change in progress flag, clearing");
+ }
+
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
ret = btrfs_validate_mount_super(fs_info);
if (ret) {
@@ -2830,7 +2853,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@@ -2942,7 +2964,7 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
+ memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(fs_info);
@@ -3091,7 +3113,7 @@ retry_root_backup:
if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
- "writeable mount is not allowed due to too many missing devices");
+ "writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
}
@@ -3760,7 +3782,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
- memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE);
flags = btrfs_super_flags(sb);
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
@@ -3931,6 +3954,13 @@ void close_ctree(struct btrfs_fs_info *fs_info)
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+ /*
+ * We don't want the cleaner to start new transactions, add more delayed
+ * iputs, etc. while we're closing. We can't use kthread_stop() yet
+ * because that frees the task_struct, and the transaction kthread might
+ * still try to wake up the cleaner.
+ */
+ kthread_park(fs_info->cleaner_kthread);
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -3958,9 +3988,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
if (!sb_rdonly(fs_info->sb)) {
/*
- * If the cleaner thread is stopped and there are
- * block groups queued for removal, the deletion will be
- * skipped when we quit the cleaner thread.
+ * The cleaner kthread is stopped, so do one final pass over
+ * unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
@@ -4061,7 +4090,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* This is a fast path so only do this check if we have sanity tests
- * enabled. Normal people shouldn't be using umapped buffers as dirty
+ * enabled. Normal people shouldn't be using unmapped buffers as dirty
* outside of the sanity tests.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
@@ -4185,6 +4214,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
+
+ /*
+ * We need this here because if we've been flipped read-only we won't
+ * get sync() from the umount, so we need to make sure any ordered
+ * extents that haven't had their dirty pages IO start writeout yet
+ * actually get run and error out properly.
+ */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4211,16 +4248,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
- if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->refs);
- spin_unlock(&delayed_refs->lock);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref_head(head);
- spin_lock(&delayed_refs->lock);
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
- }
+
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -4236,12 +4266,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
- atomic_dec(&delayed_refs->num_entries);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
@@ -4249,6 +4274,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (pin_bytes)
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
@@ -4359,13 +4385,26 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
unpin = pinned_extents;
again:
while (1) {
+ struct extent_state *cached_state = NULL;
+
+ /*
+ * btrfs_finish_extent_commit() may get the same extent range as
+ * ours between find_first_extent_bit() and clear_extent_dirty().
+ * Hence, hold the unused_bg_unpin_mutex to avoid double-unpinning
+ * the same extent range.
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, NULL);
- if (ret)
+ EXTENT_DIRTY, &cached_state);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
- clear_extent_dirty(unpin, start, end);
+ clear_extent_dirty(unpin, start, end, &cached_state);
+ free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
@@ -4420,6 +4459,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4525,7 +4565,4 @@ static const struct extent_io_ops btree_extent_io_ops = {
/* mandatory callbacks */
.submit_bio_hook = btree_submit_bio_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
- .readpage_io_failed_hook = btree_io_failed_hook,
-
- /* optional callbacks */
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4cccba22640f..987a64bc0c66 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,11 +21,11 @@
#define BTRFS_BDEV_BLOCKSIZE (4096)
enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA = 0,
- BTRFS_WQ_ENDIO_METADATA = 1,
- BTRFS_WQ_ENDIO_FREE_SPACE = 2,
- BTRFS_WQ_ENDIO_RAID56 = 3,
- BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
+ BTRFS_WQ_ENDIO_DATA,
+ BTRFS_WQ_ENDIO_METADATA,
+ BTRFS_WQ_ENDIO_FREE_SPACE,
+ BTRFS_WQ_ENDIO_RAID56,
+ BTRFS_WQ_ENDIO_DIO_REPAIR,
};
static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a4cd0221bc8d..994f0cc41799 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -51,6 +51,24 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
+/*
+ * Declare helpers to detect underflow of various space info members
+ */
+#define DECLARE_SPACE_INFO_UPDATE(name) \
+static inline void update_##name(struct btrfs_space_info *sinfo, \
+ s64 bytes) \
+{ \
+ if (bytes < 0 && sinfo->name < -bytes) { \
+ WARN_ON(1); \
+ sinfo->name = 0; \
+ return; \
+ } \
+ sinfo->name += bytes; \
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
+
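For reference, here is what one instantiation of the macro above expands to, hand-expanded from the definition shown in this hunk (illustration only, not part of the patch):

static inline void update_bytes_pinned(struct btrfs_space_info *sinfo,
				       s64 bytes)
{
	/* A negative delta bigger than the current value would underflow. */
	if (bytes < 0 && sinfo->bytes_pinned < -bytes) {
		WARN_ON(1);			/* flag the accounting bug */
		sinfo->bytes_pinned = 0;	/* clamp instead of wrapping */
		return;
	}
	sinfo->bytes_pinned += bytes;
}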
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -1037,7 +1055,7 @@ out_free:
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -2366,6 +2384,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
insert_reserved);
else
BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans->fs_info, node->bytenr,
+ node->num_bytes, 1);
return ret;
}
@@ -2403,25 +2424,77 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref
btrfs_delayed_ref_unlock(head);
}
-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
+ struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
- int ret;
if (!extent_op)
- return 0;
- head->extent_op = NULL;
+ return NULL;
+
if (head->must_insert_reserved) {
+ head->extent_op = NULL;
btrfs_free_delayed_extent_op(extent_op);
- return 0;
+ return NULL;
}
+ return extent_op;
+}
+
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+
+ extent_op = cleanup_extent_op(head);
+ if (!extent_op)
+ return 0;
+ head->extent_op = NULL;
spin_unlock(&head->lock);
ret = run_delayed_extent_op(trans, head, extent_op);
btrfs_free_delayed_extent_op(extent_op);
return ret ? ret : 1;
}
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ int nr_items = 1; /* Dropping this ref head update. */
+
+ if (head->total_ref_mod < 0) {
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (head->is_data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (head->is_system)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ space_info = __find_space_info(fs_info, flags);
+ ASSERT(space_info);
+ percpu_counter_add_batch(&space_info->total_bytes_pinned,
+ -head->num_bytes,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv,
+ * we need to drop the csum leaves for this update from our
+ * delayed_refs_rsv.
+ */
+ if (head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info,
+ head->num_bytes);
+ }
+ }
+
+ btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+}
+
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head)
{
@@ -2432,7 +2505,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
- ret = cleanup_extent_op(trans, head);
+ ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) {
unselect_delayed_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
@@ -2453,37 +2526,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
return 1;
}
- delayed_refs->num_heads--;
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
- atomic_dec(&delayed_refs->num_entries);
-
- trace_run_delayed_ref_head(fs_info, head, 0);
-
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
-
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = __find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- }
- }
if (head->must_insert_reserved) {
btrfs_pin_extent(fs_info, head->bytenr,
@@ -2494,9 +2539,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+
+ trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
return 0;
@@ -2789,40 +2834,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_rsv *global_rsv;
- u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
- u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
- unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
- u64 num_bytes, num_dirty_bgs_bytes;
- int ret = 0;
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ bool ret = false;
+ u64 reserved;
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- num_heads = heads_to_leaves(fs_info, num_heads);
- if (num_heads > 1)
- num_bytes += (num_heads - 1) * fs_info->nodesize;
- num_bytes <<= 1;
- num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
- fs_info->nodesize;
- num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
- num_dirty_bgs);
- global_rsv = &fs_info->global_block_rsv;
+ spin_lock(&global_rsv->lock);
+ reserved = global_rsv->reserved;
+ spin_unlock(&global_rsv->lock);
/*
- * If we can't allocate any more chunks lets make sure we have _lots_ of
- * wiggle room since running delayed refs can create more delayed refs.
+ * Since the global reserve is just kind of magic we don't really want
+ * to rely on it to save our bacon, so if our size is more than the
+ * delayed_refs_rsv and the global rsv then it's time to think about
+ * bailing.
*/
- if (global_rsv->space_info->full) {
- num_dirty_bgs_bytes <<= 1;
- num_bytes <<= 1;
- }
-
- spin_lock(&global_rsv->lock);
- if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
- ret = 1;
- spin_unlock(&global_rsv->lock);
+ spin_lock(&delayed_refs_rsv->lock);
+ reserved += delayed_refs_rsv->reserved;
+ if (delayed_refs_rsv->size >= reserved)
+ ret = true;
+ spin_unlock(&delayed_refs_rsv->lock);
return ret;
}
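A worked example of the rewritten check, with made-up numbers: if the global rsv holds 8 MiB and the delayed refs rsv holds 2 MiB but has grown to a size of 12 MiB, then 12 MiB >= 10 MiB and the function returns true, telling callers it is time to throttle and help run delayed refs. A minimal userspace model of the same comparison:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model: throttle once the delayed refs rsv's size outgrows what the
 * delayed refs rsv and the global rsv hold together. */
static bool check_space_for_delayed_refs(uint64_t global_reserved,
					 uint64_t delayed_refs_reserved,
					 uint64_t delayed_refs_size)
{
	uint64_t reserved = global_reserved + delayed_refs_reserved;

	return delayed_refs_size >= reserved;
}

int main(void)
{
	/* 8 MiB + 2 MiB reserved vs. a 12 MiB size: prints 1 (throttle). */
	printf("%d\n", check_space_for_delayed_refs(8ULL << 20, 2ULL << 20,
						    12ULL << 20));
	return 0;
}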
@@ -2841,7 +2874,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans);
+ return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
struct async_delayed_refs {
@@ -2954,7 +2987,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2971,7 +3003,6 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -2979,8 +3010,7 @@ again:
}
if (run_all) {
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first_cached(&delayed_refs->href_root);
@@ -3002,7 +3032,6 @@ again:
goto again;
}
out:
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3588,6 +3617,8 @@ again:
*/
mutex_lock(&trans->transaction->cache_write_mutex);
while (!list_empty(&dirty)) {
+ bool drop_reserve = true;
+
cache = list_first_entry(&dirty,
struct btrfs_block_group_cache,
dirty_list);
@@ -3660,6 +3691,7 @@ again:
list_add_tail(&cache->dirty_list,
&cur_trans->dirty_bgs);
btrfs_get_block_group(cache);
+ drop_reserve = false;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
} else if (ret) {
@@ -3667,9 +3699,11 @@ again:
}
}
- /* if its not on the io list, we need to put the block group */
+ /* if it's not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ if (drop_reserve)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
if (ret)
break;
@@ -3818,6 +3852,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4241,10 +4276,14 @@ commit_trans:
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
- * more space is released.
+ * more space is released. We don't need to
+ * explicitly run the delayed iputs here because
+ * the commit_transaction would have woken up
+ * the cleaner.
*/
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+ ret = btrfs_wait_on_delayed_iputs(fs_info);
+ if (ret)
+ return ret;
goto again;
} else {
btrfs_end_transaction(trans);
@@ -4256,7 +4295,7 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- data_sinfo->bytes_may_use += bytes;
+ update_bytes_may_use(data_sinfo, bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
@@ -4309,10 +4348,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
data_sinfo = fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- if (WARN_ON(data_sinfo->bytes_may_use < len))
- data_sinfo->bytes_may_use = 0;
- else
- data_sinfo->bytes_may_use -= len;
+ update_bytes_may_use(data_sinfo, -len);
trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
@@ -4360,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4368,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
return 1;
/*
- * We need to take into account the global rsv because for all intents
- * and purposes it's used space. Don't worry about locking the
- * global_rsv, it doesn't change except when the transaction commits.
- */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
- bytes_used += calc_global_rsv_need_space(global_rsv);
-
- /*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
@@ -4568,6 +4595,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
goto out;
} else {
ret = 1;
+ space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4589,11 +4617,9 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
btrfs_create_pending_block_groups(trans);
- btrfs_trans_release_chunk_metadata(trans);
- }
+
return ret;
}
@@ -4638,7 +4664,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
/*
* If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable. For raid56, the space info used
+ * space is actually usable. For raid56, the space info used
* doesn't include the parity drive, so we don't have to
* change the math
*/
@@ -4706,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 max_reclaim;
+ u64 async_pages;
u64 items;
long time_left;
unsigned long nr_pages;
@@ -4731,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while (delalloc_bytes && loops < 3) {
- max_reclaim = min(delalloc_bytes, to_reclaim);
- nr_pages = max_reclaim >> PAGE_SHIFT;
+ nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ /*
+ * Triggers inode writeback for up to nr_pages. This will invoke
+ * ->writepages callback and trigger delalloc filling
+ * (btrfs_run_delalloc_range()).
+ */
btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
/*
- * We need to wait for the async pages to actually start before
- * we do anything.
+ * We need to wait for the compressed pages to start before
+ * we continue.
*/
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
- if (!max_reclaim)
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
goto skip_async;
- if (max_reclaim <= nr_pages)
- max_reclaim = 0;
+ /*
+ * Calculate how many compressed pages we want to be written
+ * before we continue. I.e. if there are more async pages than we
+ * require, wait_event() will wait until nr_pages of them are written.
+ */
+ if (async_pages <= nr_pages)
+ async_pages = 0;
else
- max_reclaim -= nr_pages;
+ async_pages -= nr_pages;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
- (int)max_reclaim);
+ (int)async_pages);
skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
@@ -4773,6 +4810,7 @@ skip_async:
}
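To make the async_pages arithmetic in the hunk above concrete (illustrative numbers only): if we kicked off writeback for nr_pages = 256 and fs_info->async_delalloc_pages currently reads 1000, the wait target becomes 1000 - 256 = 744, so wait_event() returns once at least 256 of the queued compressed pages have been submitted; if the counter reads 200 (no more than nr_pages), the target is 0 and we wait for all of them. In sketch form:

	/* Illustrative restatement of the target computation above. */
	u64 async_pages = 1000;	/* atomic_read(&...->async_delalloc_pages) */
	u64 nr_pages = 256;	/* pages we just sent down for writeback */
	u64 target = (async_pages <= nr_pages) ? 0 : async_pages - nr_pages;
	/* wait_event() then sleeps until the counter drops to <= 744. */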
struct reserve_ticket {
+ u64 orig_bytes;
u64 bytes;
int error;
struct list_head list;
@@ -4794,8 +4832,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket = NULL;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes;
+ u64 bytes_needed;
+ u64 reclaim_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
if (trans)
@@ -4808,16 +4848,25 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes = (ticket) ? ticket->bytes : 0;
+ bytes_needed = (ticket) ? ticket->bytes : 0;
spin_unlock(&space_info->lock);
- if (!bytes)
+ if (!bytes_needed)
return 0;
- /* See if there is enough pinned space to make this reservation */
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * See if there is enough pinned space to make this reservation, or if
+ * we have block groups that are going to be freed, allowing us to
+ * possibly do a chunk allocation the next loop through.
+ */
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+ __percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
goto commit;
/*
@@ -4825,27 +4874,29 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
* this reservation.
*/
if (space_info != delayed_rsv->space_info)
- return -ENOSPC;
+ goto enospc;
spin_lock(&delayed_rsv->lock);
- if (delayed_rsv->size > bytes)
- bytes = 0;
- else
- bytes -= delayed_rsv->size;
+ reclaim_bytes += delayed_rsv->reserved;
spin_unlock(&delayed_rsv->lock);
+ spin_lock(&delayed_refs_rsv->lock);
+ reclaim_bytes += delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ if (reclaim_bytes >= bytes_needed)
+ goto commit;
+ bytes_needed -= reclaim_bytes;
+
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
- return -ENOSPC;
- }
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+ goto enospc;
commit:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return -ENOSPC;
-
return btrfs_commit_transaction(trans);
+enospc:
+ btrfs_end_transaction(trans);
+ return -ENOSPC;
}
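Worked example for the reclaim check above, with made-up numbers: for bytes_needed = 2 MiB, delayed_rsv->reserved = 1 MiB and delayed_refs_rsv->reserved = 1.5 MiB, reclaim_bytes totals 2.5 MiB >= bytes_needed, so we jump straight to commit; had reclaim_bytes been only 1 MiB, bytes_needed would drop to 1 MiB and that remainder would be checked against total_bytes_pinned before deciding between commit and -ENOSPC. As a sketch:

	/* Toy restatement of the decision, values in bytes. */
	u64 bytes_needed = 2ULL << 20;
	u64 reclaim_bytes = (1ULL << 20) + (3ULL << 19);	/* 1 + 1.5 MiB */

	if (reclaim_bytes >= bytes_needed)
		goto commit;			/* the rsvs cover the ticket */
	bytes_needed -= reclaim_bytes;		/* then compare vs. pinned */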
/*
@@ -4883,7 +4934,22 @@ static void flush_space(struct btrfs_fs_info *fs_info,
shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
+ case FLUSH_DELAYED_REFS_NR:
+ case FLUSH_DELAYED_REFS:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ if (state == FLUSH_DELAYED_REFS_NR)
+ nr = calc_reclaim_items_nr(fs_info, num_bytes);
+ else
+ nr = 0;
+ btrfs_run_delayed_refs(trans, nr);
+ btrfs_end_transaction(trans);
+ break;
case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4891,12 +4957,21 @@ static void flush_space(struct btrfs_fs_info *fs_info,
}
ret = do_chunk_alloc(trans,
btrfs_metadata_alloc_profile(fs_info),
- CHUNK_ALLOC_NO_FORCE);
+ (state == ALLOC_CHUNK) ?
+ CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
case COMMIT_TRANS:
+ /*
+ * If we have pending delayed iputs then we could free up a
+ * bunch of pinned space, so make sure we run the iputs before
+ * we do our pinned bytes check below.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+ btrfs_wait_on_delayed_iputs(fs_info);
+
ret = may_commit_transaction(fs_info, space_info);
break;
default:
@@ -4966,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
{
struct reserve_ticket *ticket;
@@ -4975,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
list_del_init(&ticket->list);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
+ if (ticket->bytes != ticket->orig_bytes)
+ return true;
}
+ return false;
}
/*
@@ -5027,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We don't want to force a chunk allocation until we've tried
+ * pretty hard to reclaim space. Think of the case where we
+ * freed up a bunch of space and so have a lot of pinned space
+ * to reclaim. We would rather use that than possibly create an
+ * underutilized metadata chunk. So if this is our first run
+ * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+ * commit the transaction. If nothing has changed the next go
+ * around then we can force a chunk allocation.
+ */
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+ flush_state++;
+
if (flush_state > COMMIT_TRANS) {
commit_cycles++;
if (commit_cycles > 2) {
- wake_all_tickets(&space_info->tickets);
- space_info->flush = 0;
+ if (wake_all_tickets(&space_info->tickets)) {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ commit_cycles--;
+ } else {
+ space_info->flush = 0;
+ }
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
@@ -5045,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
+static const enum btrfs_flush_state priority_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ ALLOC_CHUNK,
+};
+
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
u64 to_reclaim;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
+ int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5061,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
}
spin_unlock(&space_info->lock);
+ flush_state = 0;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim,
+ priority_flush_states[flush_state]);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@@ -5070,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
return;
}
spin_unlock(&space_info->lock);
-
- /*
- * Priority flushers can't wait on delalloc without
- * deadlocking.
- */
- if (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT)
- flush_state = ALLOC_CHUNK;
- } while (flush_state < COMMIT_TRANS);
+ } while (flush_state < ARRAY_SIZE(priority_flush_states));
}
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket, u64 orig_bytes)
+ struct reserve_ticket *ticket)
{
DEFINE_WAIT(wait);
+ u64 reclaim_bytes = 0;
int ret = 0;
spin_lock(&space_info->lock);
@@ -5107,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
ret = ticket->error;
if (!list_empty(&ticket->list))
list_del_init(&ticket->list);
- if (ticket->bytes && ticket->bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket->bytes;
- space_info->bytes_may_use -= num_bytes;
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- }
+ if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+ reclaim_bytes = ticket->orig_bytes - ticket->bytes;
spin_unlock(&space_info->lock);
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
return ret;
}
@@ -5140,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket ticket;
u64 used;
+ u64 reclaim_bytes = 0;
int ret = 0;
ASSERT(orig_bytes);
@@ -5155,13 +5250,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* If not things get more complicated.
*/
if (used + orig_bytes <= space_info->total_bytes) {
- space_info->bytes_may_use += orig_bytes;
+ update_bytes_may_use(space_info, orig_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
ret = 0;
} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
system_chunk)) {
- space_info->bytes_may_use += orig_bytes;
+ update_bytes_may_use(space_info, orig_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
ret = 0;
@@ -5175,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ticket.orig_bytes = orig_bytes;
ticket.bytes = orig_bytes;
ticket.error = 0;
init_waitqueue_head(&ticket.wait);
@@ -5215,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- return wait_reserve_ticket(fs_info, space_info, &ticket,
- orig_bytes);
+ return wait_reserve_ticket(fs_info, space_info, &ticket);
ret = 0;
priority_reclaim_metadata_space(fs_info, space_info, &ticket);
spin_lock(&space_info->lock);
if (ticket.bytes) {
- if (ticket.bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket.bytes;
- space_info->bytes_may_use -= num_bytes;
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags,
- num_bytes, 0);
-
- }
+ if (ticket.bytes < orig_bytes)
+ reclaim_bytes = orig_bytes - ticket.bytes;
list_del_init(&ticket.list);
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
+
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
ASSERT(list_empty(&ticket.list));
return ret;
}
@@ -5245,7 +5337,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
*
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
* flush out space to make room. It will do this by flushing delalloc if
* possible or committing the transaction. If flush is 0 then no attempts to
@@ -5355,6 +5447,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
return 0;
}
+/**
+ * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
+ * @fs_info - the fs info for our fs.
+ * @src - the source block rsv to transfer from.
+ * @num_bytes - the number of bytes to transfer.
+ *
+ * This transfers up to num_bytes from the src rsv to the
+ * delayed_refs_rsv. Any extra bytes are returned to the space info.
+ */
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *src,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ u64 to_free = 0;
+
+ spin_lock(&src->lock);
+ src->reserved -= num_bytes;
+ src->size -= num_bytes;
+ spin_unlock(&src->lock);
+
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
+ u64 delta = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ if (num_bytes > delta) {
+ to_free = num_bytes - delta;
+ num_bytes = delta;
+ }
+ } else {
+ to_free = num_bytes;
+ num_bytes = 0;
+ }
+
+ if (num_bytes)
+ delayed_refs_rsv->reserved += num_bytes;
+ if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
+ delayed_refs_rsv->full = 1;
+ spin_unlock(&delayed_refs_rsv->lock);
+
+ if (num_bytes)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ if (to_free)
+ space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
+ to_free);
+}
+
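To illustrate the transfer math above with a runnable toy model (userspace restatement of the hunk, not kernel code): if the delayed refs rsv is short by delta = size - reserved = 3 MiB and the caller migrates num_bytes = 5 MiB out of src, only 3 MiB lands in the rsv and to_free = 2 MiB goes back to the space info via space_info_add_old_bytes().

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 10ULL << 20, reserved = 7ULL << 20; /* 3 MiB short */
	uint64_t num_bytes = 5ULL << 20, to_free = 0;

	if (size > reserved) {
		uint64_t delta = size - reserved;

		if (num_bytes > delta) {
			to_free = num_bytes - delta;	/* back to space info */
			num_bytes = delta;		/* into the rsv */
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}
	reserved += num_bytes;
	printf("reserved=%llu to_free=%llu full=%d\n",
	       (unsigned long long)reserved, (unsigned long long)to_free,
	       reserved >= size);
	return 0;
}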
+/**
+ * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
+ * @fs_info - the fs_info for our fs.
+ * @flush - control how we can flush for this reservation.
+ *
+ * This will refill the delayed refs rsv with up to one item's worth of space and
+ * will return -ENOSPC if we can't make the reservation.
+ */
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ num_bytes = min(num_bytes, limit);
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (!num_bytes)
+ return 0;
+
+ ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
+ num_bytes, flush);
+ if (ret)
+ return ret;
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ return 0;
+}
+
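A sketch of how a call site might use the refill helper; the function name and flush mode come from this patch, but the surrounding context is hypothetical:

	/* Hypothetical call site: top up the delayed refs rsv before queueing
	 * work that will add more delayed refs. */
	ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_FLUSH_LIMITED);
	if (ret)
		return ret;	/* -ENOSPC: back off or force a commit */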
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
@@ -5408,7 +5584,7 @@ again:
flush = BTRFS_RESERVE_FLUSH_ALL;
goto again;
}
- space_info->bytes_may_use -= num_bytes;
+ update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, num_bytes, 0);
spin_unlock(&space_info->lock);
@@ -5436,7 +5612,7 @@ again:
ticket->bytes, 1);
list_del_init(&ticket->list);
num_bytes -= ticket->bytes;
- space_info->bytes_may_use += ticket->bytes;
+ update_bytes_may_use(space_info, ticket->bytes);
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@@ -5444,7 +5620,7 @@ again:
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags,
num_bytes, 1);
- space_info->bytes_may_use += num_bytes;
+ update_bytes_may_use(space_info, num_bytes);
ticket->bytes -= num_bytes;
num_bytes = 0;
}
@@ -5627,14 +5803,29 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+ *metadata_bytes = 0;
+ *qgroup_bytes = 0;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size)
+ *metadata_bytes = block_rsv->size - block_rsv->reserved;
+ if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+ *qgroup_bytes = block_rsv->qgroup_rsv_size -
+ block_rsv->qgroup_rsv_reserved;
+ spin_unlock(&block_rsv->lock);
+}
+
/**
* btrfs_inode_rsv_refill - refill the inode block rsv.
* @inode - the inode we are refilling.
- * @flush - the flusing restriction.
+ * @flush - the flushing restriction.
*
* Essentially the same as btrfs_block_rsv_refill, except it uses the
* block_rsv->size as the minimum size. We'll either refill the missing amount
- * or return if we already have enough space. This will also handle the resreve
+ * or return if we already have enough space. This will also handle the reserve
* tracepoint for the reserved amount.
*/
static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
@@ -5642,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 num_bytes = 0;
- u64 qgroup_num_bytes = 0;
+ u64 num_bytes, last = 0;
+ u64 qgroup_num_bytes;
int ret = -ENOSPC;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size)
- num_bytes = block_rsv->size - block_rsv->reserved;
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
- block_rsv->qgroup_rsv_reserved;
- spin_unlock(&block_rsv->lock);
-
+ calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
if (num_bytes == 0)
return 0;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
- if (ret)
- return ret;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ do {
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
+ true);
+ if (ret)
+ return ret;
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ last = num_bytes;
+ /*
+ * If we are fragmented we can end up with a lot of
+ * outstanding extents, which will make our size much
+ * larger than our reserved amount.
+ *
+ * A reservation made here can therefore end up much
+ * bigger than is needed in the end, once delalloc
+ * flushing has run.
+ *
+ * If that is the case, try the reservation again.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ calc_refill_bytes(block_rsv, &num_bytes,
+ &qgroup_num_bytes);
+ if (num_bytes == 0)
+ return 0;
+ }
+ } while (ret && last != num_bytes);
+
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5670,11 +5878,35 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&block_rsv->lock);
- } else
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ }
return ret;
}
+static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, u64 *qgroup_to_release)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *target = delayed_rsv;
+
+ if (target->full || target == block_rsv)
+ target = global_rsv;
+
+ if (block_rsv->space_info != target->space_info)
+ target = NULL;
+
+ return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+ qgroup_to_release);
+}
+
+void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+}
+
/**
* btrfs_inode_rsv_release - release any excessive reservation.
* @inode - the inode we need to release from.
@@ -5689,7 +5921,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 released = 0;
u64 qgroup_to_release = 0;
@@ -5699,8 +5930,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
- &qgroup_to_release);
+ released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
@@ -5711,16 +5942,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
qgroup_to_release);
}
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+/**
+ * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
+ * @fs_info - the fs_info for our fs.
+ * @nr - the number of items to drop.
+ *
+ * This drops the delayed ref head's count from the delayed refs rsv and frees
+ * any excess reservation we had.
+ */
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
+ u64 released = 0;
- if (global_rsv == block_rsv ||
- block_rsv->space_info != global_rsv->space_info)
- global_rsv = NULL;
- block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
+ released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
+ num_bytes, NULL);
+ if (released)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5751,14 +5992,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = min(num_bytes,
block_rsv->size - block_rsv->reserved);
block_rsv->reserved += num_bytes;
- sinfo->bytes_may_use += num_bytes;
+ update_bytes_may_use(sinfo, num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes,
1);
}
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- sinfo->bytes_may_use -= num_bytes;
+ update_bytes_may_use(sinfo, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes, 0);
block_rsv->reserved = block_rsv->size;
@@ -5785,9 +6026,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
fs_info->delayed_block_rsv.space_info = space_info;
+ fs_info->delayed_refs_rsv.space_info = space_info;
- fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
if (fs_info->quota_root)
@@ -5807,8 +6049,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_block_rsv.size > 0);
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.size > 0);
}
+/*
+ * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
+ * @trans - the trans that may have generated delayed refs
+ *
+ * This is to be called any time we may have adjusted trans->delayed_ref_updates;
+ * it calculates the additional size and adds it to the delayed_refs_rsv.
+ */
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ u64 num_bytes;
+
+ if (!trans->delayed_ref_updates)
+ return;
+
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info,
+ trans->delayed_ref_updates);
+ spin_lock(&delayed_rsv->lock);
+ delayed_rsv->size += num_bytes;
+ delayed_rsv->full = 0;
+ spin_unlock(&delayed_rsv->lock);
+ trans->delayed_ref_updates = 0;
+}
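As a back-of-the-envelope example of what each update adds: btrfs_calc_trans_metadata_size(fs_info, nr) is, in this era of the code, nodesize * BTRFS_MAX_LEVEL * 2 * nr (an assumption worth double-checking against ctree.h), so with a 16 KiB nodesize each delayed ref update grows the rsv's size by 256 KiB:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

/* Mirrors btrfs_calc_trans_metadata_size() as assumed above. */
static uint64_t calc_trans_metadata_size(uint32_t nodesize, unsigned num_items)
{
	return (uint64_t)nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	/* One delayed ref update on a 16 KiB-nodesize fs: prints 262144. */
	printf("%llu\n",
	       (unsigned long long)calc_trans_metadata_size(16384, 1));
	return 0;
}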
/*
* To be called after all the new block groups attached to the transaction
@@ -6101,6 +6369,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
u64 old_val;
u64 byte_in_group;
int factor;
+ int ret = 0;
/* block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -6114,8 +6383,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache)
- return -ENOENT;
+ if (!cache) {
+ ret = -ENOENT;
+ break;
+ }
factor = btrfs_bg_type_to_factor(cache->flags);
/*
@@ -6152,7 +6423,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
old_val -= num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes;
- cache->space_info->bytes_pinned += num_bytes;
+ update_bytes_pinned(cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
spin_unlock(&cache->lock);
@@ -6174,6 +6445,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
list_add_tail(&cache->dirty_list,
&trans->transaction->dirty_bgs);
trans->transaction->num_dirty_bgs++;
+ trans->delayed_ref_updates++;
btrfs_get_block_group(cache);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -6191,7 +6463,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
total -= num_bytes;
bytenr += num_bytes;
}
- return 0;
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ btrfs_update_delayed_refs_rsv(trans);
+ return ret;
}
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
@@ -6223,7 +6498,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
- cache->space_info->bytes_pinned += num_bytes;
+ update_bytes_pinned(cache->space_info, num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
@@ -6432,7 +6707,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
- space_info->bytes_may_use -= ram_bytes;
+ update_bytes_may_use(space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
}
@@ -6464,6 +6739,7 @@ static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
@@ -6587,7 +6863,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- space_info->bytes_pinned -= len;
+ update_bytes_pinned(space_info, -len);
trace_btrfs_space_reservation(fs_info, "pinned",
space_info->flags, len, 0);
@@ -6608,7 +6884,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
to_add = min(len, global_rsv->size -
global_rsv->reserved);
global_rsv->reserved += to_add;
- space_info->bytes_may_use += to_add;
+ update_bytes_may_use(space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
trace_btrfs_space_reservation(fs_info,
@@ -6647,9 +6923,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
unpin = &fs_info->freed_extents[0];
while (!trans->aborted) {
+ struct extent_state *cached_state = NULL;
+
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, NULL);
+ EXTENT_DIRTY, &cached_state);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
@@ -6659,9 +6937,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end);
+ clear_extent_dirty(unpin, start, end, &cached_state);
unpin_extent_range(fs_info, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ free_extent_state(cached_state);
cond_resched();
}
@@ -6955,12 +7234,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
- if (head->extent_op) {
- if (!head->must_insert_reserved)
- goto out;
- btrfs_free_delayed_extent_op(head->extent_op);
- head->extent_op = NULL;
- }
+ if (cleanup_extent_op(head) != NULL)
+ goto out;
/*
* waiting for the lock here would deadlock. If someone else has it
@@ -6969,22 +7244,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!mutex_trylock(&head->mutex))
goto out;
- /*
- * at this point we have a head with no other entries. Go
- * ahead and process it.
- */
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
- atomic_dec(&delayed_refs->num_entries);
-
- /*
- * we don't take a ref on the node because we're removing it from the
- * tree, so we just steal the ref the tree was holding.
- */
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
+ btrfs_delete_ref_head(delayed_refs, head);
head->processing = 0;
+
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -6992,6 +7254,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
@@ -7239,6 +7502,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
}
/*
+ * Structure used internally for find_free_extent() function. Wraps needed
+ * parameters.
+ */
+struct find_free_extent_ctl {
+ /* Basic allocation info */
+ u64 ram_bytes;
+ u64 num_bytes;
+ u64 empty_size;
+ u64 flags;
+ int delalloc;
+
+ /* Where to start the search inside the bg */
+ u64 search_start;
+
+ /* For clustered allocation */
+ u64 empty_cluster;
+
+ bool have_caching_bg;
+ bool orig_have_caching_bg;
+
+ /* RAID index, converted from flags */
+ int index;
+
+ /*
+ * Current loop number, check find_free_extent_update_loop() for details
+ */
+ int loop;
+
+ /*
+ * Whether we're refilling a cluster; if true, re-search the
+ * current block group but don't try to refill the cluster again.
+ */
+ bool retry_clustered;
+
+ /*
+ * Whether we're updating the free space cache; if true, re-search the
+ * current block group but don't try updating the free space cache again.
+ */
+ bool retry_unclustered;
+
+ /* If current block group is cached */
+ int cached;
+
+ /* Max contiguous hole found */
+ u64 max_extent_size;
+
+ /* Total free space from free space cache, not always contiguous */
+ u64 total_free_space;
+
+ /* Found result */
+ u64 found_offset;
+};
+
+
+/*
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
+ * Return -EAGAIN to inform the caller that we need to re-search this block group.
+ * Return >0 to inform the caller that we found nothing.
+ * Return 0 when we have found a location and set ffe_ctl->found_offset.
+ */
+static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+ struct btrfs_free_cluster *last_ptr,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group_cache **cluster_bg_ret)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_block_group_cache *cluster_bg;
+ u64 aligned_cluster;
+ u64 offset;
+ int ret;
+
+ cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
+ if (!cluster_bg)
+ goto refill_cluster;
+ if (cluster_bg != bg && (cluster_bg->ro ||
+ !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ goto release_cluster;
+
+ offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+ ffe_ctl->num_bytes, cluster_bg->key.objectid,
+ &ffe_ctl->max_extent_size);
+ if (offset) {
+ /* We have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(cluster_bg,
+ ffe_ctl->search_start, ffe_ctl->num_bytes);
+ *cluster_bg_ret = cluster_bg;
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+ WARN_ON(last_ptr->block_group != cluster_bg);
+
+release_cluster:
+ /*
+ * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
+ * let's just skip it and let the allocator find whatever block it can
+ * find. If we reach this point, we will have tried the cluster
+ * allocator plenty of times and not have found anything, so we are
+ * likely way too fragmented for the clustering stuff to find anything.
+ *
+ * However, if the cluster is taken from the current block group,
+ * release the cluster first, so that we stand a better chance of
+ * succeeding in the unclustered allocation.
+ */
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
+ spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+ return -ENOENT;
+ }
+
+ /* This cluster didn't work out, free it and start over */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+ if (cluster_bg != bg)
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+
+refill_cluster:
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ return -ENOENT;
+ }
+
+ aligned_cluster = max_t(u64,
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size,
+ bg->full_stripe_len);
+ ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
+ ffe_ctl->search_start, ffe_ctl->num_bytes,
+ aligned_cluster);
+ if (ret == 0) {
+ /* Now pull our allocation out of this cluster */
+ offset = btrfs_alloc_from_cluster(bg, last_ptr,
+ ffe_ctl->num_bytes, ffe_ctl->search_start,
+ &ffe_ctl->max_extent_size);
+ if (offset) {
+ /* We found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(bg,
+ ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+ } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+ !ffe_ctl->retry_clustered) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ ffe_ctl->retry_clustered = true;
+ wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+ return -EAGAIN;
+ }
+ /*
+ * At this point we either didn't find a cluster or we weren't able to
+ * allocate a block from our cluster. Free the cluster we've been
+ * trying to use, and go to the next block group.
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ return 1;
+}
+
+/*
+ * Return >0 to inform the caller that we found nothing.
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset.
+ * Return -EAGAIN to inform the caller that we need to re-search this block group.
+ */
+static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+ struct btrfs_free_cluster *last_ptr,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ u64 offset;
+
+ /*
+ * We are doing an unclustered allocation, set the fragmented flag so
+ * we don't bother trying to set up a cluster again until we get more
+ * space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
+ if (ffe_ctl->cached) {
+ struct btrfs_free_space_ctl *free_space_ctl;
+
+ free_space_ctl = bg->free_space_ctl;
+ spin_lock(&free_space_ctl->tree_lock);
+ if (free_space_ctl->free_space <
+ ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
+ ffe_ctl->empty_size) {
+ ffe_ctl->total_free_space = max_t(u64,
+ ffe_ctl->total_free_space,
+ free_space_ctl->free_space);
+ spin_unlock(&free_space_ctl->tree_lock);
+ return 1;
+ }
+ spin_unlock(&free_space_ctl->tree_lock);
+ }
+
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ &ffe_ctl->max_extent_size);
+
+ /*
+ * If we didn't find a chunk, and we haven't failed on this block group
+ * before, and this block group is in the middle of caching and we are
+ * ok with waiting, then go ahead and wait for progress to be made, and
+ * set @retry_unclustered to true.
+ *
+ * If @retry_unclustered is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+ ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+ wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_size);
+ ffe_ctl->retry_unclustered = true;
+ return -EAGAIN;
+ } else if (!offset) {
+ return 1;
+ }
+ ffe_ctl->found_offset = offset;
+ return 0;
+}
+
+/*
+ * Return >0 means the caller needs to re-search for a free extent.
+ * Return 0 means we have the needed free extent.
+ * Return <0 means we failed to locate any free extent.
+ */
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+ struct btrfs_free_cluster *last_ptr,
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl,
+ int full_search, bool use_cluster)
+{
+ struct btrfs_root *root = fs_info->extent_root;
+ int ret;
+
+ if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
+ ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
+ ffe_ctl->orig_have_caching_bg = true;
+
+ if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
+ ffe_ctl->have_caching_bg)
+ return 1;
+
+ if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
+ return 1;
+
+ if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+ return 0;
+ }
+
+ /*
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
+ */
+ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
+ ffe_ctl->index = 0;
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any uncached bgs and we've already done a
+ * full search through.
+ */
+ if (ffe_ctl->orig_have_caching_bg || !full_search)
+ ffe_ctl->loop = LOOP_CACHING_WAIT;
+ else
+ ffe_ctl->loop = LOOP_ALLOC_CHUNK;
+ } else {
+ ffe_ctl->loop++;
+ }
+
+ if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
+ struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ return ret;
+ }
+
+ ret = do_chunk_alloc(trans, ffe_ctl->flags,
+ CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+
+ /* Do not bail out on ENOSPC since we can do more. */
+ if (ret < 0 && ret != -ENOSPC)
+ btrfs_abort_transaction(trans, ret);
+ else
+ ret = 0;
+ if (!exist)
+ btrfs_end_transaction(trans);
+ if (ret)
+ return ret;
+ }
+
+ if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (ffe_ctl->empty_size == 0 &&
+ ffe_ctl->empty_cluster == 0)
+ return -ENOSPC;
+ ffe_ctl->empty_size = 0;
+ ffe_ctl->empty_cluster = 0;
+ }
+ return 1;
+ }
+ return -ENOSPC;
+}
+
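The return contract above (>0 re-search, 0 done, <0 hard failure) drives the LOOP_* ladder. A standalone model of that ladder, compilable on its own: the enum names mirror the constants referenced in the function, the skip rule is the one implemented under LOOP_CACHING_NOWAIT, and everything else is illustrative:

#include <stdio.h>

enum {
	LOOP_CACHING_NOWAIT,	/* use only already-cached block groups */
	LOOP_CACHING_WAIT,	/* wait for caching to make progress */
	LOOP_ALLOC_CHUNK,	/* force-allocate a new chunk */
	LOOP_NO_EMPTY_SIZE,	/* retry without empty_size/empty_cluster */
};

int main(void)
{
	int loop = LOOP_CACHING_NOWAIT;
	int orig_have_caching_bg = 0, full_search = 1;

	while (loop <= LOOP_NO_EMPTY_SIZE) {
		printf("searching at stage %d\n", loop);
		/* After a full search with no uncached block groups,
		 * LOOP_CACHING_WAIT cannot help, so jump straight to
		 * LOOP_ALLOC_CHUNK, as the function above does. */
		if (loop == LOOP_CACHING_NOWAIT)
			loop = (orig_have_caching_bg || !full_search) ?
				LOOP_CACHING_WAIT : LOOP_ALLOC_CHUNK;
		else
			loop++;
	}
	return 0;
}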
+/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
* ins->objectid == start position
@@ -7248,6 +7850,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
*
* If there is no suitable free space, we will record the max size of
* the free space extent currently.
+ *
+ * The overall logic and call chain:
+ *
+ * find_free_extent()
+ * |- Iterate through all block groups
+ * | |- Get a valid block group
+ * | |- Try to do clustered allocation in that block group
+ * | |- Try to do unclustered allocation in that block group
+ * | |- Check if the result is valid
+ * | | |- If valid, then exit
+ * | |- Jump to next block group
+ * |
+ * |- Push harder to find free extents
+ * |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
@@ -7255,23 +7871,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 flags, int delalloc)
{
int ret = 0;
- struct btrfs_root *root = fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
- u64 search_start = 0;
- u64 max_extent_size = 0;
- u64 empty_cluster = 0;
+ struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
- int loop = 0;
- int index = btrfs_bg_flags_to_raid_index(flags);
- bool failed_cluster_refill = false;
- bool failed_alloc = false;
bool use_cluster = true;
- bool have_caching_bg = false;
- bool orig_have_caching_bg = false;
bool full_search = false;
WARN_ON(num_bytes < fs_info->sectorsize);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.search_start = 0;
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
+ ffe_ctl.have_caching_bg = false;
+ ffe_ctl.orig_have_caching_bg = false;
+ ffe_ctl.found_offset = 0;
+
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -7307,7 +7928,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
- last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
+ last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl.empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
@@ -7324,10 +7946,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
spin_unlock(&last_ptr->lock);
}
- search_start = max(search_start, first_logical_byte(fs_info, 0));
- search_start = max(search_start, hint_byte);
- if (search_start == hint_byte) {
- block_group = btrfs_lookup_block_group(fs_info, search_start);
+ ffe_ctl.search_start = max(ffe_ctl.search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
+ if (ffe_ctl.search_start == hint_byte) {
+ block_group = btrfs_lookup_block_group(fs_info,
+ ffe_ctl.search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -7349,7 +7973,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- index = btrfs_bg_flags_to_raid_index(
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(
block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
@@ -7359,21 +7983,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
}
search:
- have_caching_bg = false;
- if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
+ ffe_ctl.have_caching_bg = false;
+ if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
+ ffe_ctl.index == 0)
full_search = true;
down_read(&space_info->groups_sem);
- list_for_each_entry(block_group, &space_info->block_groups[index],
- list) {
- u64 offset;
- int cached;
-
+ list_for_each_entry(block_group,
+ &space_info->block_groups[ffe_ctl.index], list) {
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro))
continue;
btrfs_grab_block_group(block_group, delalloc);
- search_start = block_group->key.objectid;
+ ffe_ctl.search_start = block_group->key.objectid;
/*
* this can happen if we end up cycling through all the
@@ -7397,9 +8019,9 @@ search:
}
have_block_group:
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached)) {
- have_caching_bg = true;
+ ffe_ctl.cached = block_group_cache_done(block_group);
+ if (unlikely(!ffe_ctl.cached)) {
+ ffe_ctl.have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -7413,324 +8035,105 @@ have_block_group:
* lets look there
*/
if (last_ptr && use_cluster) {
- struct btrfs_block_group_cache *used_block_group;
- unsigned long aligned_cluster;
- /*
- * the refill lock keeps out other
- * people trying to start a new cluster
- */
- used_block_group = btrfs_lock_cluster(block_group,
- last_ptr,
- delalloc);
- if (!used_block_group)
- goto refill_cluster;
-
- if (used_block_group != block_group &&
- (used_block_group->ro ||
- !block_group_bits(used_block_group, flags)))
- goto release_cluster;
-
- offset = btrfs_alloc_from_cluster(used_block_group,
- last_ptr,
- num_bytes,
- used_block_group->key.objectid,
- &max_extent_size);
- if (offset) {
- /* we have a block, we're done */
- spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(
- used_block_group,
- search_start, num_bytes);
- if (used_block_group != block_group) {
- btrfs_release_block_group(block_group,
- delalloc);
- block_group = used_block_group;
- }
- goto checks;
- }
-
- WARN_ON(last_ptr->block_group != used_block_group);
-release_cluster:
- /* If we are on LOOP_NO_EMPTY_SIZE, we can't
- * set up a new clusters, so lets just skip it
- * and let the allocator find whatever block
- * it can find. If we reach this point, we
- * will have tried the cluster allocator
- * plenty of times and not have found
- * anything, so we are likely way too
- * fragmented for the clustering stuff to find
- * anything.
- *
- * However, if the cluster is taken from the
- * current block group, release the cluster
- * first, so that we stand a better chance of
- * succeeding in the unclustered
- * allocation. */
- if (loop >= LOOP_NO_EMPTY_SIZE &&
- used_block_group != block_group) {
- spin_unlock(&last_ptr->refill_lock);
- btrfs_release_block_group(used_block_group,
- delalloc);
- goto unclustered_alloc;
- }
-
- /*
- * this cluster didn't work out, free it and
- * start over
- */
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ struct btrfs_block_group_cache *cluster_bg = NULL;
- if (used_block_group != block_group)
- btrfs_release_block_group(used_block_group,
- delalloc);
-refill_cluster:
- if (loop >= LOOP_NO_EMPTY_SIZE) {
- spin_unlock(&last_ptr->refill_lock);
- goto unclustered_alloc;
- }
+ ret = find_free_extent_clustered(block_group, last_ptr,
+ &ffe_ctl, &cluster_bg);
- aligned_cluster = max_t(unsigned long,
- empty_cluster + empty_size,
- block_group->full_stripe_len);
-
- /* allocate a cluster in this block group */
- ret = btrfs_find_space_cluster(fs_info, block_group,
- last_ptr, search_start,
- num_bytes,
- aligned_cluster);
if (ret == 0) {
- /*
- * now pull our allocation out of this
- * cluster
- */
- offset = btrfs_alloc_from_cluster(block_group,
- last_ptr,
- num_bytes,
- search_start,
- &max_extent_size);
- if (offset) {
- /* we found one, proceed */
- spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(
- block_group, search_start,
- num_bytes);
- goto checks;
+ if (cluster_bg && cluster_bg != block_group) {
+ btrfs_release_block_group(block_group,
+ delalloc);
+ block_group = cluster_bg;
}
- } else if (!cached && loop > LOOP_CACHING_NOWAIT
- && !failed_cluster_refill) {
- spin_unlock(&last_ptr->refill_lock);
-
- failed_cluster_refill = true;
- wait_block_group_cache_progress(block_group,
- num_bytes + empty_cluster + empty_size);
+ goto checks;
+ } else if (ret == -EAGAIN) {
goto have_block_group;
- }
-
- /*
- * at this point we either didn't find a cluster
- * or we weren't able to allocate a block from our
- * cluster. Free the cluster we've been trying
- * to use, and go to the next block group
- */
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
- spin_unlock(&last_ptr->refill_lock);
- goto loop;
- }
-
-unclustered_alloc:
- /*
- * We are doing an unclustered alloc, set the fragmented flag so
- * we don't bother trying to setup a cluster again until we get
- * more space.
- */
- if (unlikely(last_ptr)) {
- spin_lock(&last_ptr->lock);
- last_ptr->fragmented = 1;
- spin_unlock(&last_ptr->lock);
- }
- if (cached) {
- struct btrfs_free_space_ctl *ctl =
- block_group->free_space_ctl;
-
- spin_lock(&ctl->tree_lock);
- if (ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- if (ctl->free_space > max_extent_size)
- max_extent_size = ctl->free_space;
- spin_unlock(&ctl->tree_lock);
+ } else if (ret > 0) {
goto loop;
}
- spin_unlock(&ctl->tree_lock);
+ /* ret == -ENOENT case falls through */
}
- offset = btrfs_find_space_for_alloc(block_group, search_start,
- num_bytes, empty_size,
- &max_extent_size);
- /*
- * If we didn't find a chunk, and we haven't failed on this
- * block group before, and this block group is in the middle of
- * caching and we are ok with waiting, then go ahead and wait
- * for progress to be made, and set failed_alloc to true.
- *
- * If failed_alloc is true then we've already waited on this
- * block group once and should move on to the next block group.
- */
- if (!offset && !failed_alloc && !cached &&
- loop > LOOP_CACHING_NOWAIT) {
- wait_block_group_cache_progress(block_group,
- num_bytes + empty_size);
- failed_alloc = true;
+ ret = find_free_extent_unclustered(block_group, last_ptr,
+ &ffe_ctl);
+ if (ret == -EAGAIN)
goto have_block_group;
- } else if (!offset) {
+ else if (ret > 0)
goto loop;
- }
+ /* ret == 0 case falls through */
checks:
- search_start = round_up(offset, fs_info->stripesize);
+ ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (search_start + num_bytes >
+ if (ffe_ctl.search_start + num_bytes >
block_group->key.objectid + block_group->key.offset) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+ num_bytes);
goto loop;
}
- if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
- search_start - offset);
+ if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+ ffe_ctl.search_start - ffe_ctl.found_offset);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+ num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = search_start;
+ ins->objectid = ffe_ctl.search_start;
ins->offset = num_bytes;
- trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
+ num_bytes);
btrfs_release_block_group(block_group, delalloc);
break;
loop:
- failed_cluster_refill = false;
- failed_alloc = false;
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
- index);
+ ffe_ctl.index);
btrfs_release_block_group(block_group, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
- && !orig_have_caching_bg)
- orig_have_caching_bg = true;
-
- if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
- goto search;
-
- if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
+ full_search, use_cluster);
+ if (ret > 0)
goto search;
- /*
- * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
- * caching kthreads as we move along
- * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
- * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
- * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
- * again
- */
- if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
- index = 0;
- if (loop == LOOP_CACHING_NOWAIT) {
- /*
- * We want to skip the LOOP_CACHING_WAIT step if we
- * don't have any uncached bgs and we've already done a
- * full search through.
- */
- if (orig_have_caching_bg || !full_search)
- loop = LOOP_CACHING_WAIT;
- else
- loop = LOOP_ALLOC_CHUNK;
- } else {
- loop++;
- }
-
- if (loop == LOOP_ALLOC_CHUNK) {
- struct btrfs_trans_handle *trans;
- int exist = 0;
-
- trans = current->journal_info;
- if (trans)
- exist = 1;
- else
- trans = btrfs_join_transaction(root);
-
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
- ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
-
- /*
- * If we can't allocate a new chunk we've already looped
- * through at least once, move on to the NO_EMPTY_SIZE
- * case.
- */
- if (ret == -ENOSPC)
- loop = LOOP_NO_EMPTY_SIZE;
-
- /*
- * Do not bail out on ENOSPC since we
- * can do more things.
- */
- if (ret < 0 && ret != -ENOSPC)
- btrfs_abort_transaction(trans, ret);
- else
- ret = 0;
- if (!exist)
- btrfs_end_transaction(trans);
- if (ret)
- goto out;
- }
-
- if (loop == LOOP_NO_EMPTY_SIZE) {
- /*
- * Don't loop again if we already have no empty_size and
- * no empty_cluster.
- */
- if (empty_size == 0 &&
- empty_cluster == 0) {
- ret = -ENOSPC;
- goto out;
- }
- empty_size = 0;
- empty_cluster = 0;
- }
-
- goto search;
- } else if (!ins->objectid) {
- ret = -ENOSPC;
- } else if (ins->objectid) {
- if (!use_cluster && last_ptr) {
- spin_lock(&last_ptr->lock);
- last_ptr->window_start = ins->objectid;
- spin_unlock(&last_ptr->lock);
- }
- ret = 0;
- }
-out:
if (ret == -ENOSPC) {
+ /*
+ * Use ffe_ctl->total_free_space as fallback if we can't find
+ * any contiguous hole.
+ */
+ if (!ffe_ctl.max_extent_size)
+ ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = max_extent_size;
+ space_info->max_extent_size = ffe_ctl.max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = max_extent_size;
+ ins->offset = ffe_ctl.max_extent_size;
}
return ret;
}
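On the -ENOSPC path above, ins->offset carries the largest free extent seen (falling back to total_free_space) back to the caller as a sizing hint. A toy model of that protocol; the retry policy below is illustrative and not the kernel's exact logic:

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t objectid, offset; };

/* Succeed when the request fits the (pretend) largest hole; otherwise
 * fail and report that hole's size through ins->offset. */
static int find_free_extent_model(uint64_t want, uint64_t largest,
				  struct key *ins)
{
	if (want <= largest) {
		ins->objectid = 4096;		/* pretend hole start */
		ins->offset = want;
		return 0;
	}
	ins->offset = largest;			/* the hint */
	return -28;				/* -ENOSPC */
}

int main(void)
{
	struct key ins = { 0, 0 };
	uint64_t want = 1024 * 1024;

	while (find_free_extent_model(want, 512 * 1024, &ins) != 0)
		want = ins.offset;	/* clamp to the reported maximum */
	printf("allocated %llu bytes at %llu\n",
	       (unsigned long long)ins.offset,
	       (unsigned long long)ins.objectid);
	return 0;
}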
+#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
+do { \
+ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
+ spin_lock(&__rsv->lock); \
+ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
+ __rsv->size, __rsv->reserved); \
+ spin_unlock(&__rsv->lock); \
+} while (0)
+
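The macro relies on preprocessor stringification: #rsv_name prints the field's own name, and the same token selects the field. Hand-expanded, one invocation becomes roughly:

/* DUMP_BLOCK_RSV(fs_info, trans_block_rsv) expands to approximately: */
do {
	struct btrfs_block_rsv *__rsv = &(fs_info)->trans_block_rsv;
	spin_lock(&__rsv->lock);
	btrfs_info(fs_info, "trans_block_rsv: size %llu reserved %llu",
		   __rsv->size, __rsv->reserved);
	spin_unlock(&__rsv->lock);
} while (0);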
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
@@ -7750,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
info->bytes_readonly);
spin_unlock(&info->lock);
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
if (!dump_block_groups)
return;
@@ -8004,21 +8413,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
+ if (!path)
return -ENOMEM;
- }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
return ret;
}
@@ -8164,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clean_tree_block(fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8173,13 +8575,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(buf, owner);
- write_extent_buffer_fsid(buf, fs_info->fsid);
+ write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
- * EXENT bit to differentiate dirty pages.
+ * EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
@@ -8225,7 +8627,12 @@ again:
goto again;
}
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ /*
+ * The global reserve still exists to save us from ourselves, so don't
+ * warn_on if we are short on our delayed refs reserve.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+ btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -8548,7 +8955,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
- u32 blocksize;
struct btrfs_key key;
struct btrfs_key first_key;
struct extent_buffer *next;
@@ -8573,7 +8979,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
btrfs_node_key_to_cpu(path->nodes[level], &first_key,
path->slots[level]);
- blocksize = fs_info->nodesize;
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
@@ -8586,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -8646,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
}
level--;
@@ -8697,7 +9102,7 @@ skip:
ret);
}
}
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
@@ -8758,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -8800,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(fs_info, eb);
@@ -8948,13 +9353,26 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_free;
}
+ err = btrfs_run_delayed_items(trans);
+ if (err)
+ goto out_end_trans;
+
if (block_rsv)
trans->block_rsv = block_rsv;
+ /*
+ * This will help us catch people modifying the fs tree while we're
+ * dropping it. It is unsafe to mess with the fs tree while it's being
+ * dropped as we unlock the root node and parent nodes as we walk down
+ * the tree, assuming nothing will change. If something does change
+ * then we'll have stale information and drop references to blocks we've
+ * already dropped.
+ */
+ set_bit(BTRFS_ROOT_DELETING, &root->state);
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
@@ -8984,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9251,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
+ u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
@@ -9277,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo_used = btrfs_space_info_used(sinfo, true);
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
- min_allocable_bytes <= sinfo->total_bytes) {
+ if (sinfo_used + num_bytes + min_allocable_bytes <=
+ sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9288,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
+ if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(cache->fs_info,
+ "unable to make block group %llu ro",
+ cache->key.objectid);
+ btrfs_info(cache->fs_info,
+ "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+ sinfo_used, num_bytes, min_allocable_bytes);
+ dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ }
return ret;
}
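The numbers printed by the new ENOSPC_DEBUG branch come from a simple capacity check. A standalone model of that check, using the same formula as the hunk but purely illustrative figures:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Per block group: everything not reserved, pinned, used or
	 * consumed by superblock copies could become read-only. */
	uint64_t bg_size = 1024, reserved = 100, pinned = 50,
		 bytes_super = 16, used = 600;
	uint64_t num_bytes = bg_size - reserved - pinned - bytes_super - used;

	/* Per space info: flipping the group RO must still leave room. */
	uint64_t sinfo_used = 4000, min_allocable = 256, total = 8192;

	if (sinfo_used + num_bytes + min_allocable <= total)
		printf("can set RO, num_bytes=%llu\n",
		       (unsigned long long)num_bytes);
	else
		printf("-ENOSPC: sinfo_used=%llu num_bytes=%llu\n",
		       (unsigned long long)sinfo_used,
		       (unsigned long long)num_bytes);
	return 0;
}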
@@ -9425,7 +9854,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
}
/*
- * checks to see if its even possible to relocate this block group.
+ * Checks to see if it's even possible to relocate this block group.
*
* @return - -1 if it's not a good idea to relocate this block group, 0 if its
* ok to go ahead and try.
@@ -10053,7 +10482,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
* check for two cases, either we are full, and therefore
* don't need to bother with the caching work since we won't
* find any space, or we are empty, and we can just add all
- * the space in and be done with it. This saves us _alot_ of
+ * the space in and be done with it. This saves us a lot of
* time, particularly in the full case.
*/
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
@@ -10132,9 +10561,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
- trans->can_flush_pending_bgs = false;
+ if (!trans->can_flush_pending_bgs)
+ return;
+
while (!list_empty(&trans->new_bgs)) {
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group_cache,
@@ -10157,9 +10587,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
add_block_group_free_space(trans, block_group);
/* already aborted the transaction if it failed. */
next:
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
list_del_init(&block_group->bg_list);
}
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
+ btrfs_trans_release_chunk_metadata(trans);
}
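The btrfs_delayed_refs_rsv_release() call added in the loop above pairs with the btrfs_update_delayed_refs_rsv() bump made when a block group is queued (see btrfs_make_block_group() below). A toy invariant check, purely illustrative:

#include <assert.h>

int main(void)
{
	int rsv_units = 0;

	rsv_units++;	/* btrfs_make_block_group(): queue pending bg */
	rsv_units--;	/* btrfs_create_pending_block_groups(): flush it */
	assert(rsv_units == 0);	/* every queued unit is given back */
	return 0;
}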
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
@@ -10234,6 +10665,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
link_block_group(cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
+ trans->delayed_ref_updates++;
+ btrfs_update_delayed_refs_rsv(trans);
set_avail_alloc_bits(fs_info, type);
return 0;
@@ -10271,6 +10704,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_em;
+ bool remove_rsv = false;
block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!block_group);
@@ -10318,7 +10752,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_lock(&trans->transaction->cache_write_mutex);
/*
- * make sure our free spache cache IO is done before remove the
+ * Make sure our free space cache IO is done before removing the
* free space inode
*/
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -10335,6 +10769,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (!list_empty(&block_group->dirty_list)) {
list_del_init(&block_group->dirty_list);
+ remove_rsv = true;
btrfs_put_block_group(block_group);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -10431,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
spin_lock(&trans->transaction->dirty_bgs_lock);
- if (!list_empty(&block_group->dirty_list)) {
- WARN_ON(1);
- }
- if (!list_empty(&block_group->io_list)) {
- WARN_ON(1);
- }
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
@@ -10544,6 +10976,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
out:
+ if (remove_rsv)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
return ret;
}
@@ -10701,7 +11135,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- space_info->bytes_pinned -= block_group->pinned;
+ update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
percpu_counter_add_batch(&space_info->total_bytes_pinned,
-block_group->pinned,
@@ -10832,7 +11266,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
if (!blk_queue_discard(bdev_get_queue(device->bdev)))
return 0;
- /* Not writeable = nothing to do. */
+ /* Not writable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..ab705183d749 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void)
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
{
- if (tree->ops && tree->ops->check_extent_io_range)
- tree->ops->check_extent_io_range(tree->private_data, caller,
- start, end);
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode || !is_data_inode(inode))
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
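The open-coded check that replaces the old callback flags "odd" ranges: extent-io ranges are inclusive, so a sane end sits one byte short of a boundary and is therefore odd, unless the range ends exactly at isize - 1. A standalone model of the heuristic (same expression as above; the scaffolding is mine):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

static int range_is_suspect(uint64_t end, uint64_t isize)
{
	return end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1;
}

int main(void)
{
	printf("%d\n", range_is_suspect(4095, 1 << 20)); /* 0: normal end */
	printf("%d\n", range_is_suspect(4096, 1 << 20)); /* 1: suspect */
	printf("%d\n", range_is_suspect(4096, 4097));    /* 0: ends at isize - 1 */
	return 0;
}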
@@ -138,7 +147,39 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ blk_status_t ret = 0;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+ struct bio_vec bv;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+
+ mp_bvec_last_segment(bvec, &bv);
+ start = page_offset(bv.bv_page) + bv.bv_offset;
+
+ bio->bi_private = NULL;
+
+ if (tree->ops)
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+ mirror_num, bio_flags, start);
+ else
+ btrfsic_submit_bio(bio);
+
+ return blk_status_to_errno(ret);
+}
+
+static void flush_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ int ret;
+
+ ret = submit_one_bio(epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
+ epd->bio = NULL;
+ }
+}
int __init extent_io_init(void)
{
@@ -272,8 +313,8 @@ do_insert:
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
struct rb_node **next_ret,
+ struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
@@ -302,23 +343,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
if (parent_ret)
*parent_ret = prev;
- if (prev_ret) {
+ if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *prev_ret = prev;
+ *next_ret = prev;
prev = orig_prev;
}
- if (next_ret) {
+ if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
while (prev && offset < prev_entry->start) {
prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *next_ret = prev;
+ *prev_ret = prev;
}
return NULL;
}
@@ -329,12 +370,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
- struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+ ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret)
- return prev;
+ return next;
return ret;
}
@@ -344,13 +385,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
- struct extent_state *other)
-{
- if (tree->ops && tree->ops->merge_extent_hook)
- tree->ops->merge_extent_hook(tree->private_data, new, other);
-}
-
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
@@ -374,7 +408,10 @@ static void merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
- merge_cb(tree, state, other);
+ if (tree->private_data &&
+ is_data_inode(tree->private_data))
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
state->start = other->start;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -386,7 +423,10 @@ static void merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
- merge_cb(tree, state, other);
+ if (tree->private_data &&
+ is_data_inode(tree->private_data))
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
state->end = other->end;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -395,20 +435,6 @@ static void merge_state(struct extent_io_tree *tree,
}
}
-static void set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits)
-{
- if (tree->ops && tree->ops->set_bit_hook)
- tree->ops->set_bit_hook(tree->private_data, state, bits);
-}
-
-static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits)
-{
- if (tree->ops && tree->ops->clear_bit_hook)
- tree->ops->clear_bit_hook(tree->private_data, state, bits);
-}
-
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits,
struct extent_changeset *changeset);
@@ -451,13 +477,6 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
-static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
- u64 split)
-{
- if (tree->ops && tree->ops->split_extent_hook)
- tree->ops->split_extent_hook(tree->private_data, orig, split);
-}
-
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -477,7 +496,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
{
struct rb_node *node;
- split_cb(tree, orig, split);
+ if (tree->private_data && is_data_inode(tree->private_data))
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -504,7 +524,7 @@ static struct extent_state *next_state(struct extent_state *state)
/*
* utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1).
+ * it will optionally wake up anyone waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
@@ -523,7 +543,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
WARN_ON(range > tree->dirty_bytes);
tree->dirty_bytes -= range;
}
- clear_state_cb(tree, state, bits);
+
+ if (tree->private_data && is_data_inode(tree->private_data))
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
state->state &= ~bits_to_clear;
@@ -594,7 +617,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (delete)
bits |= ~EXTENT_CTLBITS;
- bits |= EXTENT_FIRST_DELALLOC;
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
@@ -800,7 +822,9 @@ static void set_state_bits(struct extent_io_tree *tree,
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
- set_state_cb(tree, state, bits);
+ if (tree->private_data && is_data_inode(tree->private_data))
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
@@ -857,7 +881,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
- bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -1459,16 +1482,16 @@ out:
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
*
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
*/
-static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+static noinline bool find_delalloc_range(struct extent_io_tree *tree,
u64 *start, u64 *end, u64 max_bytes,
struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
- u64 found = 0;
+ bool found = false;
u64 total_bytes = 0;
spin_lock(&tree->lock);
@@ -1479,8 +1502,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*/
node = tree_search(tree, cur_start);
if (!node) {
- if (!found)
- *end = (u64)-1;
+ *end = (u64)-1;
goto out;
}
@@ -1500,7 +1522,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*cached_state = state;
refcount_inc(&state->refs);
}
- found++;
+ found = true;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
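The walk above accumulates adjacent states until the range breaks or max_bytes worth has been gathered. A compilable toy model of that accumulation; the state array and sizes are invented, while the contiguity and max_bytes rules mirror the function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct state { uint64_t start, end; };	/* inclusive ranges */

static bool find_range(const struct state *s, int n, uint64_t *start,
		       uint64_t *end, uint64_t max_bytes)
{
	bool found = false;
	uint64_t cur = *start, total = 0;

	for (int i = 0; i < n; i++) {
		if (found && s[i].start != cur)
			break;			/* gap: stop extending */
		if (!found)
			*start = s[i].start;
		found = true;
		*end = s[i].end;
		cur = s[i].end + 1;
		total += s[i].end - s[i].start + 1;
		if (total >= max_bytes)
			break;
	}
	return found;
}

int main(void)
{
	struct state s[] = { {0, 4095}, {4096, 8191}, {16384, 20479} };
	uint64_t start = 0, end = 0;

	if (find_range(s, 3, &start, &end, 128 * 1024 * 1024))
		printf("delalloc [%llu, %llu]\n",	/* [0, 8191] */
		       (unsigned long long)start,
		       (unsigned long long)end);
	return 0;
}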
@@ -1558,19 +1580,22 @@ static noinline int lock_delalloc_pages(struct inode *inode,
}
/*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'. start and end are used to return the range,
+ * Find and lock a contiguous range of bytes in the file marked as delalloc, no
+ * more than @max_bytes. @start and @end are used to return the range.
*
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * Return: true if we find something
+ * false if nothing was in the tree
*/
-static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
+EXPORT_FOR_TESTS
+noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes)
+ u64 *end)
{
+ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
- u64 found;
+ bool found;
struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1585,7 +1610,7 @@ again:
*start = delalloc_start;
*end = delalloc_end;
free_extent_state(cached_state);
- return 0;
+ return false;
}
/*
@@ -1605,6 +1630,7 @@ again:
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
delalloc_start, delalloc_end);
+ ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
@@ -1616,11 +1642,10 @@ again:
loops = 1;
goto again;
} else {
- found = 0;
+ found = false;
goto out_failed;
}
}
- BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
@@ -1643,17 +1668,6 @@ out_failed:
return found;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-u64 btrfs_find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
- struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes)
-{
- return find_lock_delalloc_range(inode, tree, locked_page, start, end,
- max_bytes);
-}
-#endif
-
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
@@ -2349,13 +2363,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
}
/*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * This is a generic handler for readpage errors. If other copies exist, read
+ * those and write back good data to the failed position. Does not investigate
+ * in remapping the failed extent elsewhere, hoping the device will be smart
+ * enough to do this as needed
*/
-
static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct page *page, u64 start, u64 end,
int failed_mirror)
@@ -2368,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
- unsigned failed_bio_pages = bio_pages_all(failed_bio);
+ unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2412,14 +2424,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
- struct extent_io_tree *tree;
int ret = 0;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, start, end, NULL,
- uptodate);
+ btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
if (!uptodate) {
ClearPageUptodate(page);
@@ -2445,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2516,12 +2524,15 @@ static void end_bio_extent_readpage(struct bio *bio)
int mirror;
int ret;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ bool data_inode = btrfs_ino(BTRFS_I(inode))
+ != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2551,7 +2562,7 @@ static void end_bio_extent_readpage(struct bio *bio)
len = bvec->bv_len;
mirror = io_bio->mirror_num;
- if (likely(uptodate && tree->ops)) {
+ if (likely(uptodate)) {
ret = tree->ops->readpage_end_io_hook(io_bio, offset,
page, start, end,
mirror);
@@ -2567,38 +2578,37 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (tree->ops) {
- ret = tree->ops->readpage_io_failed_hook(page, mirror);
- if (ret == -EAGAIN) {
- /*
- * Data inode's readpage_io_failed_hook() always
- * returns -EAGAIN.
- *
- * The generic bio_readpage_error handles errors
- * the following way: If possible, new read
- * requests are created and submitted and will
- * end up in end_bio_extent_readpage as well (if
- * we're lucky, not in the !uptodate case). In
- * that case it returns 0 and we just go on with
- * the next page in our bio. If it can't handle
- * the error it will return -EIO and we remain
- * responsible for that page.
- */
- ret = bio_readpage_error(bio, offset, page,
- start, end, mirror);
- if (ret == 0) {
- uptodate = !bio->bi_status;
- offset += len;
- continue;
- }
- }
+ if (data_inode) {
/*
- * metadata's readpage_io_failed_hook() always returns
- * -EIO and fixes nothing. -EIO is also returned if
- * data inode error could not be fixed.
+ * The generic bio_readpage_error handles errors the
+ * following way: If possible, new read requests are
+ * created and submitted and will end up in
+ * end_bio_extent_readpage as well (if we're lucky,
+ * not in the !uptodate case). In that case it returns
+ * 0 and we just go on with the next page in our bio.
+ * If it can't handle the error it will return -EIO and
+ * we remain responsible for that page.
*/
- ASSERT(ret == -EIO);
+ ret = bio_readpage_error(bio, offset, page, start, end,
+ mirror);
+ if (ret == 0) {
+ uptodate = !bio->bi_status;
+ offset += len;
+ continue;
+ }
+ } else {
+ struct extent_buffer *eb;
+
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = mirror;
+ atomic_dec(&eb->io_pages);
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
+ &eb->bflags))
+ btree_readahead_hook(eb, -EIO);
+
+ ret = -EIO;
}
readpage_ok:
if (likely(uptodate)) {
@@ -2607,7 +2617,7 @@ readpage_ok:
unsigned off;
/* Zero out the end if this page straddles i_size */
- off = i_size & (PAGE_SIZE-1);
+ off = offset_in_page(i_size);
if (page->index == end_index && off)
zero_user_segment(page, off, PAGE_SIZE);
SetPageUptodate(page);
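offset_in_page(), used here and throughout the extent-buffer hunks further down, replaces the hand-rolled masking. For a power-of-two page size the two spellings are identical, which this small standalone check demonstrates (the macro body below restates the usual definition for the sake of the example):

#include <assert.h>

#define PAGE_SIZE 4096UL
#define offset_in_page(p) ((unsigned long)(p) & (PAGE_SIZE - 1))

int main(void)
{
	assert(offset_in_page(8192 + 123) == ((8192 + 123) & (PAGE_SIZE - 1)));
	assert(offset_in_page(4096) == 0);	/* boundary maps to 0 */
	assert(offset_in_page(4095) == 4095);	/* last byte of a page */
	return 0;
}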
@@ -2644,8 +2654,7 @@ readpage_ok:
if (extent_len)
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
- if (io_bio->end_io)
- io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
+ btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -2715,28 +2724,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct page *page = bvec->bv_page;
- struct extent_io_tree *tree = bio->bi_private;
- u64 start;
-
- start = page_offset(page) + bvec->bv_offset;
-
- bio->bi_private = NULL;
-
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags, start);
- else
- btrfsic_submit_bio(bio);
-
- return blk_status_to_errno(ret);
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
@@ -2782,8 +2769,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
else
contig = bio_end_sector(bio) == sector;
- if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size,
- bio, bio_flags))
+ ASSERT(tree->ops);
+ if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
if (prev_bio_flags != bio_flags || !contig || !can_merge ||
@@ -2911,7 +2898,7 @@ static int __do_readpage(struct extent_io_tree *tree,
if (page->index == last_byte >> PAGE_SHIFT) {
char *userpage;
- size_t zero_offset = last_byte & (PAGE_SIZE - 1);
+ size_t zero_offset = offset_in_page(last_byte);
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
@@ -3205,7 +3192,7 @@ static void update_nr_written(struct writeback_control *wbc,
/*
* helper for __extent_writepage, doing all of the delayed allocation setup.
*
- * This returns 1 if our fill_delalloc function did all the work required
+ * This returns 1 if the btrfs_run_delalloc_range function did all the work required
* to write the page (copy into inline extent). In this case the IO has
* been started and the page is already unlocked.
*
@@ -3213,44 +3200,37 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct inode *inode,
- struct page *page, struct writeback_control *wbc,
- struct extent_page_data *epd,
- u64 delalloc_start,
- unsigned long *nr_written)
+ struct page *page, struct writeback_control *wbc,
+ u64 delalloc_start, unsigned long *nr_written)
{
- struct extent_io_tree *tree = epd->tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 page_end = delalloc_start + PAGE_SIZE - 1;
- u64 nr_delalloc;
+ bool found;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
int ret;
int page_started = 0;
- if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
- return 0;
while (delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(inode, tree,
+ found = find_lock_delalloc_range(inode, tree,
page,
&delalloc_start,
- &delalloc_end,
- BTRFS_MAX_EXTENT_SIZE);
- if (nr_delalloc == 0) {
+ &delalloc_end);
+ if (!found) {
delalloc_start = delalloc_end + 1;
continue;
}
- ret = tree->ops->fill_delalloc(inode, page,
- delalloc_start,
- delalloc_end,
- &page_started,
- nr_written, wbc);
+ ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
+ delalloc_end, &page_started, nr_written, wbc);
/* File system has been set read-only */
if (ret) {
SetPageError(page);
- /* fill_delalloc should be return < 0 for error
- * but just in case, we use > 0 here meaning the
- * IO is started, so we don't want to return > 0
- * unless things are going well.
+ /*
+ * btrfs_run_delalloc_range should return < 0 for error
+ * but just in case, we use > 0 here meaning the IO is
+ * started, so we don't want to return > 0 unless
+ * things are going well.
*/
ret = ret < 0 ? ret : -EIO;
goto done;
@@ -3323,20 +3303,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
int nr = 0;
bool compressed;
- if (tree->ops && tree->ops->writepage_start_hook) {
- ret = tree->ops->writepage_start_hook(page, start,
- page_end);
- if (ret) {
- /* Fixup worker will requeue */
- if (ret == -EBUSY)
- wbc->pages_skipped++;
- else
- redirty_page_for_writepage(wbc, page);
+ ret = btrfs_writepage_cow_fixup(page, start, page_end);
+ if (ret) {
+ /* Fixup worker will requeue */
+ if (ret == -EBUSY)
+ wbc->pages_skipped++;
+ else
+ redirty_page_for_writepage(wbc, page);
- update_nr_written(wbc, nr_written);
- unlock_page(page);
- return 1;
- }
+ update_nr_written(wbc, nr_written);
+ unlock_page(page);
+ return 1;
}
/*
@@ -3347,9 +3324,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
end = page_end;
if (i_size <= start) {
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, start,
- page_end, NULL, 1);
+ btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
goto done;
}
@@ -3360,9 +3335,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
u64 offset;
if (cur >= i_size) {
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, cur,
- page_end, NULL, 1);
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ page_end, 1);
break;
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
@@ -3396,11 +3370,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
* end_io notification does not happen here for
* compressed extents
*/
- if (!compressed && tree->ops &&
- tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, cur,
- cur + iosize - 1,
- NULL, 1);
+ if (!compressed)
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ cur + iosize - 1,
+ 1);
else if (compressed) {
/* we don't want to end_page_writeback on
* a compressed extent. this happens
@@ -3469,7 +3442,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ClearPageError(page);
- pg_offset = i_size & (PAGE_SIZE - 1);
+ pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
@@ -3491,11 +3464,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
- ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
- if (ret == 1)
- goto done_unlocked;
- if (ret)
- goto done;
+ if (!epd->extent_locked) {
+ ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+ }
ret = __extent_writepage_io(inode, page, wbc, epd,
i_size, nr_written, write_flags, &nr);
@@ -3669,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
@@ -3784,7 +3760,7 @@ int btree_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -3909,7 +3885,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
@@ -3934,12 +3910,25 @@ static int extent_write_cache_pages(struct address_space *mapping,
range_whole = 1;
scanned = 1;
}
- if (wbc->sync_mode == WB_SYNC_ALL)
+
+ /*
+ * We do the tagged writepage as long as the snapshot flush bit is set
+ * and we are the first one to do the filemap_flush() on this inode.
+ *
+ * The nr_to_write == LONG_MAX is needed to make sure other flushers do
+ * not race in and drop the bit.
+ */
+ if (range_whole && wbc->nr_to_write == LONG_MAX &&
+ test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &BTRFS_I(inode)->runtime_flags))
+ wbc->tagged_writepages = 1;
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !nr_to_write_done && (index <= end) &&
@@ -4029,17 +4018,6 @@ retry:
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd)
-{
- if (epd->bio) {
- int ret;
-
- ret = submit_one_bio(epd->bio, 0, 0);
- BUG_ON(ret < 0); /* -ENOMEM */
- epd->bio = NULL;
- }
-}
-
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
@@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, start,
- start + PAGE_SIZE - 1,
- NULL, 1);
+ btrfs_writepage_endio_finish_ordered(page, start,
+ start + PAGE_SIZE - 1, 1);
unlock_page(page);
}
put_page(page);
@@ -4118,42 +4094,35 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages)
{
struct bio *bio = NULL;
- unsigned page_idx;
unsigned long bio_flags = 0;
struct page *pagepool[16];
- struct page *page;
struct extent_map *em_cached = NULL;
struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- page = list_entry(pages->prev, struct page, lru);
+ while (!list_empty(pages)) {
+ for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
+ struct page *page = lru_to_page(pages);
- prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping,
- page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- continue;
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ readahead_gfp_mask(mapping))) {
+ put_page(page);
+ continue;
+ }
+
+ pagepool[nr++] = page;
}
- pagepool[nr++] = page;
- if (nr < ARRAY_SIZE(pagepool))
- continue;
__extent_readpages(tree, pagepool, nr, &em_cached, &bio,
- &bio_flags, &prev_em_start);
- nr = 0;
+ &bio_flags, &prev_em_start);
}
- if (nr)
- __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
- &bio_flags, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
- BUG_ON(!list_empty(pages));
if (bio)
return submit_one_bio(bio, 0, bio_flags);
return 0;
@@ -4290,8 +4259,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
- len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4342,7 +4310,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
/*
* Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cahced one.
+ * fiemap extent won't overlap with cached one.
* Not recoverable.
*
* NOTE: Physical address can overlap, due to compression
@@ -4914,13 +4882,6 @@ again:
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
- /*
- * We will free dummy extent buffer's if they come into
- * free_extent_buffer with a ref count of 2, but if we are using this we
- * want the buffers to stay in memory until we're done with them, so
- * bump the ref count again.
- */
- atomic_inc(&eb->refs);
return eb;
free_eb:
btrfs_release_extent_buffer(eb);
@@ -5102,7 +5063,9 @@ void free_extent_buffer(struct extent_buffer *eb)
while (1) {
refs = atomic_read(&eb->refs);
- if (refs <= 3)
+ if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
+ || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
+ refs == 1))
break;
old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
if (old == refs)
@@ -5111,10 +5074,6 @@ void free_extent_buffer(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
- test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
- atomic_dec(&eb->refs);
-
- if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5159,11 +5118,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
@@ -5342,7 +5299,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
if (start + len > eb->len) {
@@ -5352,7 +5309,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
return;
}
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5377,14 +5334,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5415,10 +5372,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb,
char **map, unsigned long *map_start,
unsigned long *map_len)
{
- size_t offset = start & (PAGE_SIZE - 1);
+ size_t offset;
char *kaddr;
struct page *p;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
unsigned long end_i = (start_offset + start + min_len - 1) >>
PAGE_SHIFT;
@@ -5455,14 +5412,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5511,13 +5468,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5541,13 +5498,13 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5586,13 +5543,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(dst->start);
unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
WARN_ON(src->len != dst_len);
- offset = (start_offset + dst_offset) &
- (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5628,7 +5584,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
unsigned long *page_index,
size_t *page_offset)
{
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -5640,7 +5596,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
offset = start_offset + start + byte_offset;
*page_index = offset >> PAGE_SHIFT;
- *page_offset = offset & (PAGE_SIZE - 1);
+ *page_offset = offset_in_page(offset);
}
/**
@@ -5782,7 +5738,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
@@ -5800,10 +5756,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
while (len > 0) {
- dst_off_in_page = (start_offset + dst_offset) &
- (PAGE_SIZE - 1);
- src_off_in_page = (start_offset + src_offset) &
- (PAGE_SIZE - 1);
+ dst_off_in_page = offset_in_page(start_offset + dst_offset);
+ src_off_in_page = offset_in_page(start_offset + src_offset);
dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
src_i = (start_offset + src_offset) >> PAGE_SHIFT;
@@ -5831,7 +5785,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
@@ -5855,10 +5809,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
src_i = (start_offset + src_end) >> PAGE_SHIFT;
- dst_off_in_page = (start_offset + dst_end) &
- (PAGE_SIZE - 1);
- src_off_in_page = (start_offset + src_end) &
- (PAGE_SIZE - 1);
+ dst_off_in_page = offset_in_page(start_offset + dst_end);
+ src_off_in_page = offset_in_page(start_offset + src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
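
Every hunk in this stretch replaces the open-coded mask eb->start & ((u64)PAGE_SIZE - 1) with offset_in_page(). The two forms agree for any power-of-two page size; a standalone sketch, assuming a 4 KiB page:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the kernel derives these per-architecture. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Model of offset_in_page(): the byte offset within the containing page. */
#define offset_in_page(p) ((unsigned long)(p) & (PAGE_SIZE - 1))

int main(void)
{
        uint64_t start = 0x12345678;

        /* The open-coded mask and the helper are equivalent. */
        assert((start & ((uint64_t)PAGE_SIZE - 1)) == offset_in_page(start));
        printf("page %llu, offset %lu\n",
               (unsigned long long)(start >> PAGE_SHIFT),
               offset_in_page(start));
        return 0;
}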
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 369daa5d4f73..08749e0b9c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,17 +18,16 @@
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
#define EXTENT_CLEAR_META_RESV (1U << 11)
-#define EXTENT_FIRST_DELALLOC (1U << 12)
-#define EXTENT_NEED_WAIT (1U << 13)
-#define EXTENT_DAMAGED (1U << 14)
-#define EXTENT_NORESERVE (1U << 15)
-#define EXTENT_QGROUP_RESERVED (1U << 16)
-#define EXTENT_CLEAR_DATA_RESV (1U << 17)
-#define EXTENT_DELALLOC_NEW (1U << 18)
+#define EXTENT_NEED_WAIT (1U << 12)
+#define EXTENT_DAMAGED (1U << 13)
+#define EXTENT_NORESERVE (1U << 14)
+#define EXTENT_QGROUP_RESERVED (1U << 15)
+#define EXTENT_CLEAR_DATA_RESV (1U << 16)
+#define EXTENT_DELALLOC_NEW (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
/*
* flags for bio submission. The high bits indicate the compression
@@ -37,18 +36,22 @@
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
-/* these are bit numbers for test/set bit */
-#define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_DIRTY 2
-#define EXTENT_BUFFER_CORRUPT 3
-#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
-#define EXTENT_BUFFER_TREE_REF 5
-#define EXTENT_BUFFER_STALE 6
-#define EXTENT_BUFFER_WRITEBACK 7
-#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
-#define EXTENT_BUFFER_UNMAPPED 9
-#define EXTENT_BUFFER_IN_TREE 10
-#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
+enum {
+ EXTENT_BUFFER_UPTODATE,
+ EXTENT_BUFFER_DIRTY,
+ EXTENT_BUFFER_CORRUPT,
+ /* this got triggered by readahead */
+ EXTENT_BUFFER_READAHEAD,
+ EXTENT_BUFFER_TREE_REF,
+ EXTENT_BUFFER_STALE,
+ EXTENT_BUFFER_WRITEBACK,
+ /* read IO error */
+ EXTENT_BUFFER_READ_ERR,
+ EXTENT_BUFFER_UNMAPPED,
+ EXTENT_BUFFER_IN_TREE,
+ /* write IO error */
+ EXTENT_BUFFER_WRITE_ERR,
+};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
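
The conversion from #defines to an anonymous enum renumbers the bits densely: the old list skipped 1 (UPTODATE was 0, DIRTY was 2), while an enum counts 0, 1, 2, ... That is safe here because the values are only ever used as bit indices for set_bit()/test_bit() and never persisted, so all that matters is that they stay distinct. A userspace sketch with stand-in bitops:

#include <assert.h>
#include <stdio.h>

enum {
        EXTENT_BUFFER_UPTODATE,        /* 0, unchanged */
        EXTENT_BUFFER_DIRTY,           /* now 1, was 2 */
        EXTENT_BUFFER_CORRUPT,         /* now 2, was 3 */
};

/* Minimal userspace stand-ins for the kernel bitops. */
static void set_bit(int nr, unsigned long *flags)
{
        *flags |= 1UL << nr;
}

static int test_bit(int nr, const unsigned long *flags)
{
        return !!(*flags & (1UL << nr));
}

int main(void)
{
        unsigned long bflags = 0;

        set_bit(EXTENT_BUFFER_DIRTY, &bflags);
        assert(test_bit(EXTENT_BUFFER_DIRTY, &bflags));
        assert(!test_bit(EXTENT_BUFFER_CORRUPT, &bflags));
        printf("bflags = %#lx\n", bflags);
        return 0;
}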
@@ -94,38 +97,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct extent_io_ops {
/*
- * The following callbacks must be allways defined, the function
+ * The following callbacks must always be defined; the function
* pointer will be called unconditionally.
*/
extent_submit_bio_hook_t *submit_bio_hook;
int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
struct page *page, u64 start, u64 end,
int mirror);
- int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-
- /*
- * Optional hooks, called if the pointer is not NULL
- */
- int (*fill_delalloc)(void *private_data, struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- struct writeback_control *wbc);
-
- int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
- void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
- struct extent_state *state, int uptodate);
- void (*set_bit_hook)(void *private_data, struct extent_state *state,
- unsigned *bits);
- void (*clear_bit_hook)(void *private_data,
- struct extent_state *state,
- unsigned *bits);
- void (*merge_extent_hook)(void *private_data,
- struct extent_state *new,
- struct extent_state *other);
- void (*split_extent_hook)(void *private_data,
- struct extent_state *orig, u64 split);
- void (*check_extent_io_range)(void *private_data, const char *caller,
- u64 start, u64 end);
};
struct extent_io_tree {
@@ -353,11 +331,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end)
+ u64 end, struct extent_state **cached)
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL);
+ EXTENT_DO_ACCOUNTING, 0, 0, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -546,10 +524,9 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-u64 btrfs_find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
- struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes);
+bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7eea8b6e2cd3..928f729c55ba 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (!list_empty(&prev->list) || !list_empty(&next->list))
return 0;
+ ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+ prev->block_start != EXTENT_MAP_DELALLOC);
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start == EXTENT_MAP_DELALLOC &&
- prev->block_start == EXTENT_MAP_DELALLOC) ||
(next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
next->block_start == extent_map_block_end(prev)))) {
return 1;
@@ -475,7 +476,8 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
return container_of(prev, struct extent_map, rb_node);
}
-/* helper for btfs_get_extent. Given an existing extent in the tree,
+/*
+ * Helper for btrfs_get_extent. Given an existing extent in the tree,
* the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 31977ffd6190..473f039fcd7c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -9,15 +9,23 @@
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
#define EXTENT_MAP_DELALLOC ((u64)-1)
-/* bits for the flags field */
-#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
-#define EXTENT_FLAG_COMPRESSED 1
-#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
-#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
-#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
-#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
+/* bits for the extent_map::flags field */
+enum {
+ /* this entry not yet on disk, don't free it */
+ EXTENT_FLAG_PINNED,
+ EXTENT_FLAG_COMPRESSED,
+ /* pre-allocated extent */
+ EXTENT_FLAG_PREALLOC,
+ /* Logging this extent */
+ EXTENT_FLAG_LOGGING,
+ /* Filling in a preallocated extent */
+ EXTENT_FLAG_FILLING,
+ /* filesystem extent mapping type */
+ EXTENT_FLAG_FS_MAPPING,
+};
struct extent_map {
struct rb_node rb_node;
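
EXTENT_MAP_DELALLOC and its siblings are sentinels packed into the top of the u64 range; anything below EXTENT_MAP_LAST_BYTE is a real disk byte number. A sketch of how block_start values decode under that convention (the decoder function is illustrative, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

#define EXTENT_MAP_LAST_BYTE ((uint64_t)-4)
#define EXTENT_MAP_HOLE      ((uint64_t)-3)
#define EXTENT_MAP_INLINE    ((uint64_t)-2)
#define EXTENT_MAP_DELALLOC  ((uint64_t)-1)        /* fiemap only, per the patch */

static const char *block_start_name(uint64_t block_start)
{
        switch (block_start) {
        case EXTENT_MAP_HOLE:     return "hole";
        case EXTENT_MAP_INLINE:   return "inline";
        case EXTENT_MAP_DELALLOC: return "delalloc (fiemap)";
        default:
                return block_start < EXTENT_MAP_LAST_BYTE ? "on disk"
                                                          : "reserved";
        }
}

int main(void)
{
        printf("%s\n", block_start_name(EXTENT_MAP_HOLE));
        printf("%s\n", block_start_name(4096));
        return 0;
}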
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ba74827beb32..920bf3b4b0ef 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
-static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
-{
- kfree(bio->csum_allocated);
-}
-
static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
@@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum_allocated = kmalloc_array(nblocks,
- csum_size, GFP_NOFS);
- if (!btrfs_bio->csum_allocated) {
+ btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
+ GFP_NOFS);
+ if (!btrfs_bio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
- btrfs_bio->csum = btrfs_bio->csum_allocated;
- btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
} else {
btrfs_bio->csum = btrfs_bio->csum_inline;
}
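
With the separate csum_allocated pointer and the readpage endio callback gone, one pointer does double duty: it aims either at the inline array or at a heap allocation, and the free side can tell the two apart by comparison. A userspace sketch of that pattern (structure and sizes are simplified):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define INLINE_CSUM_SIZE 64        /* stands in for BTRFS_BIO_INLINE_CSUM_SIZE */

struct io_bio {
        uint8_t *csum;
        uint8_t  csum_inline[INLINE_CSUM_SIZE];
};

static int setup_csums(struct io_bio *bio, size_t nblocks, size_t csum_size)
{
        if (nblocks * csum_size > INLINE_CSUM_SIZE) {
                bio->csum = calloc(nblocks, csum_size);
                if (!bio->csum)
                        return -1;        /* kernel: BLK_STS_RESOURCE */
        } else {
                bio->csum = bio->csum_inline;
        }
        return 0;
}

static void free_csums(struct io_bio *bio)
{
        /* Only heap buffers need freeing. */
        if (bio->csum != bio->csum_inline)
                free(bio->csum);
}

int main(void)
{
        struct io_bio bio;

        if (setup_csums(&bio, 256, 4) == 0) {        /* 1 KiB: spills to heap */
                printf("heap allocated: %d\n", bio.csum != bio.csum_inline);
                free_csums(&bio);
        }
        return 0;
}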
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 15b925142793..34fe8a58b0e9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
size_t copied = 0;
size_t total_copied = 0;
int pg = 0;
- int offset = pos & (PAGE_SIZE - 1);
+ int offset = offset_in_page(pos);
while (write_bytes > 0) {
size_t count = min_t(size_t,
@@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
- size_t offset = pos & (PAGE_SIZE - 1);
+ size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
@@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by settattr when we are about to truncate
+ * ordered_data_close is set by setattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
@@ -2078,14 +2078,47 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
inode_lock(inode);
+
+ /*
+ * We take the dio_sem here because the tree log stuff can race with
+ * lockless dio writes and get an extent map logged for an extent we
+ * never waited on. We need it this high up for lockdep reasons.
+ */
+ down_write(&BTRFS_I(inode)->dio_sem);
+
atomic_inc(&root->log_batch);
/*
+ * Before we acquired the inode's lock, someone may have dirtied more
+ * pages in the target range. We need to make sure that writeback for
+ * any such pages does not start while we are logging the inode, because
+ * if it does, any of the following might happen when we are not doing a
+ * full inode sync:
+ *
+ * 1) We log an extent after its writeback finishes but before its
+ * checksums are added to the csum tree, leading to -EIO errors
+ * when attempting to read the extent after a log replay.
+ *
+ * 2) We can end up logging an extent before its writeback finishes.
+ * Therefore after the log replay we will have a file extent item
+ * pointing to an unwritten extent (and no data checksums as well).
+ *
+ * So trigger writeback for any eventual new dirty pages and then we
+ * wait for all ordered extents to complete below.
+ */
+ ret = start_ordered_ops(inode, start, end);
+ if (ret) {
+ inode_unlock(inode);
+ goto out;
+ }
+
+ /*
* We have to do this here to avoid the priority inversion of waiting on
- * IO of a lower priority task while holding a transaciton open.
+ * IO of a lower priority task while holding a transaction open.
*/
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
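
Since dio_sem is now taken right after the inode lock, error paths must release it together with the lock, in reverse order of acquisition, before returning. A compact userspace model of that unwind discipline with a single exit label (pthread stand-ins for the kernel locks; error numbers are placeholders):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t  inode_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t dio_sem    = PTHREAD_RWLOCK_INITIALIZER;

static int sync_file(int fail_step)
{
        int ret = 0;

        pthread_mutex_lock(&inode_lock);        /* inode_lock(inode) */
        pthread_rwlock_wrlock(&dio_sem);        /* down_write(&dio_sem) */

        if (fail_step == 1) { ret = -1; goto out; }  /* e.g. ordered ops fail */
        if (fail_step == 2) { ret = -2; goto out; }  /* e.g. txn start fails */
        /* ... log the inode ... */
out:
        pthread_rwlock_unlock(&dio_sem);        /* up_write(&dio_sem) */
        pthread_mutex_unlock(&inode_lock);      /* inode_unlock(inode) */
        return ret;
}

int main(void)
{
        printf("ok=%d err=%d\n", sync_file(0), sync_file(1));
        return 0;
}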
@@ -2109,6 +2142,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2120,13 +2154,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* here we could get into a situation where we're waiting on IO to
* happen that is blocked on a transaction trying to commit. With start
* we inc the extwriter counter, so we wait for all extwriters to exit
- * before we start blocking join'ers. This comment is to keep somebody
+ * before we start blocking joiners. This comment is to keep somebody
* from thinking they are super smart and changing this to
* btrfs_join_transaction *cough*Josef*cough*.
*/
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2148,27 +2183,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
- /*
- * If any of the ordered extents had an error, just return it to user
- * space, so that the application knows some writes didn't succeed and
- * can take proper action (retry for e.g.). Blindly committing the
- * transaction in this case, would fool userspace that everything was
- * successful. And we also want to make sure our log doesn't contain
- * file extent items pointing to extents that weren't fully written to -
- * just like in the non fast fsync path, where we check for the ordered
- * operation's error flag before writing to the log tree and return -EIO
- * if any of them had this flag set (btrfs_wait_ordered_range) -
- * therefore we need to check for errors in the ordered operations,
- * which are indicated by ctx.io_err.
- */
- if (ctx.io_err) {
- btrfs_end_transaction(trans);
- ret = ctx.io_err;
- goto out;
- }
-
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
@@ -3201,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
&cached_state);
while (start < inode->i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
- start, len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
@@ -3286,8 +3302,7 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl,
#endif
- .clone_file_range = btrfs_clone_file_range,
- .dedupe_file_range = btrfs_dedupe_file_range,
+ .remap_file_range = btrfs_remap_file_range,
};
void __cold btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 67441219d6c9..74aa552f4793 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -75,7 +75,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+ btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
@@ -838,6 +839,25 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
path->search_commit_root = 1;
path->skip_locking = 1;
+ /*
+ * We must pass a path with search_commit_root set to btrfs_iget in
+ * order to avoid a deadlock when allocating extents for the tree root.
+ *
+ * When we are COWing an extent buffer from the tree root, when looking
+ * for a free extent, at extent-tree.c:find_free_extent(), we can find
+ * a block group without its free space cache loaded. When we find one
+ * we must load its space cache which requires reading its free space
+ * cache's inode item from the root tree. If this inode item is located
+ * in the same leaf that we started COWing before, then we end up in
+ * a deadlock on the extent buffer (trying to read lock it when we
+ * previously write locked it).
+ *
+ * It's safe to read the inode item using the commit root because
+ * block groups, once loaded, stay in memory forever (until they are
+ * removed) as well as their space caches once loaded. New block groups
+ * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
+ * we will never try to read their inode item while the fs is mounted.
+ */
inode = lookup_free_space_inode(fs_info, block_group, path);
if (IS_ERR(inode)) {
btrfs_free_path(path);
@@ -1772,6 +1792,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
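
get_max_extent_size() exists because a bitmap entry's ->bytes counts all free space it tracks, which may be fragmented; reporting it as the largest allocatable extent would overstate what an allocation can actually get. A sketch of the distinction (numbers are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* For a bitmap, ->bytes is total free space tracked while
 * ->max_extent_size caches the largest contiguous run found in it;
 * extent entries are contiguous, so their ->bytes is the answer. */
struct free_space {
        uint64_t bytes;
        uint64_t max_extent_size;
        bool     bitmap;
};

static uint64_t get_max_extent_size(const struct free_space *entry)
{
        if (entry->bitmap)
                return entry->max_extent_size;
        return entry->bytes;
}

int main(void)
{
        /* 1 MiB free in total, but the largest contiguous run is 64 KiB. */
        struct free_space bm = {
                .bytes           = 1ULL << 20,
                .max_extent_size = 64ULL << 10,
                .bitmap          = true,
        };
        uint64_t max_extent_size = 0;

        /* The patched callers report this instead of ->bytes. */
        if (get_max_extent_size(&bm) > max_extent_size)
                max_extent_size = get_max_extent_size(&bm);
        printf("max extent: %llu\n", (unsigned long long)max_extent_size);
        return 0;
}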
@@ -1793,8 +1820,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1812,8 +1839,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1825,8 +1852,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2449,6 +2478,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2457,6 +2487,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
@@ -2685,8 +2716,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2723,8 +2754,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index d6736595ec57..e5089087eaa6 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -74,11 +74,11 @@ out:
return ret;
}
-struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_path *path, int cow)
+EXPORT_FOR_TESTS
+struct btrfs_free_space_info *search_free_space_info(
+ struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow)
{
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_key key;
@@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
}
}
+EXPORT_FOR_TESTS
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
@@ -315,6 +316,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
@@ -487,6 +489,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int free_space_test_bit(struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 offset)
{
@@ -775,6 +778,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
@@ -968,6 +972,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
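
The EXPORT_FOR_TESTS annotations give these helpers external linkage only in self-test builds; in normal builds they stay static. A sketch modeled on the macro btrfs uses, with a trivial stand-in function:

#include <stdio.h>

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS
#else
#define EXPORT_FOR_TESTS static
#endif

/* Non-static under the test config so the self-tests can call it. */
EXPORT_FOR_TESTS int free_space_test_bit_model(unsigned long bitmap, int nr)
{
        return !!(bitmap & (1UL << nr));
}

int main(void)
{
        printf("%d\n", free_space_test_bit_model(0xAUL, 1));
        return 0;
}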
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 181c58b23110..82fdda8ff5ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -27,6 +27,7 @@
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
+#include <linux/swap.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode,
/*
* Cleanup all submitted ordered extents in specified range to handle errors
- * from the fill_dellaloc() callback.
+ * from the btrfs_run_delalloc_range() callback.
*
* NOTE: caller must ensure that when an error happens, it can not call
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
- const u64 offset,
- const u64 bytes)
+ struct page *locked_page,
+ u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start = page_offset(locked_page);
+ u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
while (index <= end_index) {
@@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
- return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
- bytes - PAGE_SIZE, false);
+
+ /*
+ * If this page belongs to the delalloc range being instantiated, skip
+ * it, since the first page of a range is going to be properly cleaned
+ * up by the caller of run_delalloc_range.
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ offset += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
+ }
+
+ return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
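
The new locked_page handling shrinks the cleanup range by one page when the locked page falls inside it, since the caller of run_delalloc_range cleans up the first page itself. The arithmetic in isolation (the helper name and page size are illustrative):

#include <inttypes.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

/* If the locked page lies inside [offset, offset + bytes), drop the
 * first page from the cleanup range. */
static void trim_cleanup_range(uint64_t page_start, uint64_t *offset,
                               uint64_t *bytes)
{
        uint64_t page_end = page_start + PAGE_SIZE - 1;

        if (page_start >= *offset && page_end <= *offset + *bytes - 1) {
                *offset += PAGE_SIZE;
                *bytes  -= PAGE_SIZE;
        }
}

int main(void)
{
        uint64_t offset = 0, bytes = 4 * PAGE_SIZE;

        trim_cleanup_range(0, &offset, &bytes);        /* locked page is page 0 */
        printf("cleanup covers [%" PRIu64 ", +%" PRIu64 ")\n", offset, bytes);
        return 0;
}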
@@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
start >> PAGE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = start & (PAGE_SIZE - 1);
+ offset = offset_in_page(start);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
@@ -357,7 +368,7 @@ struct async_extent {
struct async_cow {
struct inode *inode;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
struct page *locked_page;
u64 start;
u64 end;
@@ -442,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
- u64 isize = i_size_read(inode);
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -456,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, isize, end + 1);
+ actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -502,6 +512,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -537,8 +548,7 @@ again:
&total_compressed);
if (!ret) {
- unsigned long offset = total_compressed &
- (PAGE_SIZE - 1);
+ unsigned long offset = offset_in_page(total_compressed);
struct page *page = pages[nr_pages - 1];
char *kaddr;
@@ -703,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline void submit_compressed_extents(struct inode *inode,
- struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_cow *async_cow)
{
+ struct inode *inode = async_cow->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
@@ -846,14 +856,13 @@ retry:
ins.offset, async_extent->pages,
async_extent->nr_pages,
async_cow->write_flags)) {
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->i_mapping;
- tree->ops->writepage_end_io_hook(p, start, end,
- NULL, 0);
+ btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, end,
NULL, 0,
@@ -1143,13 +1152,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info;
struct async_cow *async_cow;
- struct btrfs_root *root;
unsigned long nr_pages;
async_cow = container_of(work, struct async_cow, work);
- root = async_cow->root;
- fs_info = root->fs_info;
+ fs_info = async_cow->fs_info;
nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -1158,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
+ /*
+ * ->inode could be NULL if async_cow_start has failed to compress,
+ * in which case we don't have anything to submit, yet we need to
+ * always adjust ->async_delalloc_pages as it's paired with the init
+ * happening in cow_file_range_async
+ */
if (async_cow->inode)
- submit_compressed_extents(async_cow->inode, async_cow);
+ submit_compressed_extents(async_cow);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1178,7 +1191,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_cow *async_cow;
- struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
@@ -1187,8 +1199,13 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = igrab(inode);
- async_cow->root = root;
+ /*
+ * igrab is called higher up in the call chain, take only the
+ * lightweight reference for the callback lifetime
+ */
+ ihold(inode);
+ async_cow->inode = inode;
+ async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
async_cow->write_flags = write_flags;
@@ -1371,7 +1388,8 @@ next_slot:
* Do the same check as in btrfs_cross_ref_exist but
* without the unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
+ if (!nolock &&
+ btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
@@ -1530,12 +1548,11 @@ out_check:
}
btrfs_release_path(path);
- if (cur_offset <= end && cow_start == (u64)-1) {
+ if (cur_offset <= end && cow_start == (u64)-1)
cow_start = cur_offset;
- cur_offset = end;
- }
if (cow_start != (u64)-1) {
+ cur_offset = end;
ret = cow_file_range(inode, locked_page, cow_start, end, end,
page_started, nr_written, 1, NULL);
if (ret)
@@ -1576,14 +1593,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
}
/*
- * extent_io.c call back to do delayed allocation processing
+ * Function to process delayed allocation (create CoW) for ranges which are
+ * being touched for the first time.
*/
-static int run_delalloc_range(void *private_data, struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- struct writeback_control *wbc)
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started, unsigned long *nr_written,
+ struct writeback_control *wbc)
{
- struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -1605,14 +1621,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,
write_flags);
}
if (ret)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
return ret;
}
-static void btrfs_split_extent_hook(void *private_data,
- struct extent_state *orig, u64 split)
+void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split)
{
- struct inode *inode = private_data;
u64 size;
/* not delalloc, ignore it */
@@ -1625,7 +1641,7 @@ static void btrfs_split_extent_hook(void *private_data,
u64 new_size;
/*
- * See the explanation in btrfs_merge_extent_hook, the same
+ * See the explanation in btrfs_merge_delalloc_extent; the same
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
@@ -1642,16 +1658,13 @@ static void btrfs_split_extent_hook(void *private_data,
}
/*
- * extent_io.c merge_extent_hook, used to track merged delayed allocation
- * extents so we can keep track of new extents that are just merged onto old
- * extents, such as when we are doing sequential writes, so we can properly
- * account for the metadata space we'll need.
+ * Handle merged delayed allocation extents so we can keep track of new extents
+ * that are just merged onto old extents, such as when we are doing sequential
+ * writes, so we can properly account for the metadata space we'll need.
*/
-static void btrfs_merge_extent_hook(void *private_data,
- struct extent_state *new,
- struct extent_state *other)
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other)
{
- struct inode *inode = private_data;
u64 new_size, old_size;
u32 num_extents;
@@ -1755,15 +1768,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
}
/*
- * extent_io.c set_bit_hook, used to track delayed allocation
- * bytes in this file, and to maintain the list of inodes that
- * have pending delalloc work to be done.
+ * Properly track delayed allocation bytes in the inode and maintain the
+ * list of inodes that have pending delalloc work to be done.
*/
-static void btrfs_set_bit_hook(void *private_data,
- struct extent_state *state, unsigned *bits)
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+ unsigned *bits)
{
- struct inode *inode = private_data;
-
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1809,14 +1819,14 @@ static void btrfs_set_bit_hook(void *private_data,
}
/*
- * extent_io.c clear_bit_hook, see set_bit_hook for why
+ * Once a range is no longer delalloc this function ensures that proper
+ * accounting happens.
*/
-static void btrfs_clear_bit_hook(void *private_data,
- struct extent_state *state,
- unsigned *bits)
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+ struct extent_state *state, unsigned *bits)
{
- struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
@@ -1841,7 +1851,7 @@ static void btrfs_clear_bit_hook(void *private_data,
/*
* We don't reserve metadata space for space cache inodes so we
- * don't need to call dellalloc_release_metadata if there is an
+ * don't need to call delalloc_release_metadata if there is an
* error.
*/
if (*bits & EXTENT_CLEAR_META_RESV &&
@@ -1880,16 +1890,21 @@ static void btrfs_clear_bit_hook(void *private_data,
}
/*
- * Merge bio hook, this must check the chunk tree to make sure we don't create
- * bios that span stripes or chunks
+ * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
+ * in a chunk's stripe. This function ensures that bios do not span a
+ * stripe/chunk
*
- * return 1 if page cannot be merged to bio
- * return 0 if page can be merged to bio
+ * @page - The page we are about to add to the bio
+ * @size - size we want to add to the bio
+ * @bio - bio we want to ensure is smaller than a stripe
+ * @bio_flags - flags of the bio
+ *
+ * return 1 if page cannot be added to the bio
+ * return 0 if page can be added to the bio
* return error otherwise
*/
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio,
- unsigned long bio_flags)
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
+ unsigned long bio_flags)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1932,29 +1947,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
}
/*
- * in order to insert checksums into the metadata in large chunks,
- * we wait until bio submission time. All the pages in the bio are
- * checksummed and sums are attached onto the ordered extent record.
- *
- * At IO completion time the cums attached on the ordered extent record
- * are inserted into the btree
- */
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num)
-{
- struct inode *inode = private_data;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
-
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
- return ret;
-}
-
-/*
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read.
*
@@ -2056,7 +2048,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state, int dedupe)
{
- WARN_ON((end & (PAGE_SIZE - 1)) == 0);
+ WARN_ON(PAGE_ALIGNED(end));
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
extra_bits, cached_state);
}
@@ -2152,7 +2144,7 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2940,6 +2932,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
+ bool clear_reserved_extent = true;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3043,10 +3036,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- if (!ret)
+ if (!ret) {
+ clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
+ }
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -3107,8 +3102,13 @@ out:
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
*/
if ((ret || !logical_len) &&
+ clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(fs_info,
@@ -3138,9 +3138,6 @@ out:
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- /* Try to release some metadata so we don't get an OOM but don't wait */
- btrfs_btree_balance_dirty_nodelay(fs_info);
-
return ret;
}
@@ -3151,8 +3148,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
-static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
- struct extent_state *state, int uptodate)
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+ u64 end, int uptodate)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3259,10 +3256,13 @@ void btrfs_add_delayed_iput(struct inode *inode)
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
+ atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
+ if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
@@ -3277,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set. Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps in user operations like fallocate etc
+ * that might get blocked on the iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
+ if (ret)
+ return -EINTR;
+ return 0;
+}
+
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
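
btrfs_wait_on_delayed_iputs() pairs an atomic counter with a waitqueue: callers bump nr_delayed_iputs before queuing, the runner decrements after each iput and wakes waiters at zero. A pthread-based userspace model of the same pattern (the kernel's wait_event_killable() additionally aborts on a fatal signal, which this sketch omits):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  zero = PTHREAD_COND_INITIALIZER;
static int nr_delayed_iputs;

static void add_delayed_iput(void)
{
        pthread_mutex_lock(&lock);
        nr_delayed_iputs++;                /* atomic_inc(&nr_delayed_iputs) */
        pthread_mutex_unlock(&lock);
}

static void run_one_iput(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_delayed_iputs == 0)       /* atomic_dec_and_test() */
                pthread_cond_broadcast(&zero);  /* wake_up(&delayed_iputs_wait) */
        pthread_mutex_unlock(&lock);
}

static void wait_on_delayed_iputs(void)
{
        pthread_mutex_lock(&lock);
        while (nr_delayed_iputs != 0)      /* wait_event_killable(..., == 0) */
                pthread_cond_wait(&zero, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        add_delayed_iput();
        run_one_iput();
        wait_on_delayed_iputs();        /* returns at once: counter is 0 */
        puts("no delayed iputs pending");
        return 0;
}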
@@ -3561,10 +3582,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
/*
* read an inode from the btree into the in-memory inode
*/
-static int btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode,
+ struct btrfs_path *in_path)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_path *path;
+ struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3580,15 +3602,18 @@ static int btrfs_read_locked_inode(struct inode *inode)
if (!ret)
filled = true;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
- btrfs_free_path(path);
+ if (path != in_path)
+ btrfs_free_path(path);
return ret;
}
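
The in_path parameter lets btrfs_iget_path() thread a caller-prepared path (for example one with search_commit_root set, as the free-space-cache hunk above needs) through to btrfs_read_locked_inode(), which otherwise allocates its own. The reuse-or-allocate pattern in isolation (types and error values are simplified):

#include <stdio.h>
#include <stdlib.h>

struct path {
        int search_commit_root;
};

/* Reuse the caller's path if given, otherwise allocate; free only what
 * we allocated ourselves. */
static int read_inode(struct path *in_path)
{
        struct path *path = in_path;
        int ret = 0;

        if (!path) {
                path = calloc(1, sizeof(*path));
                if (!path)
                        return -1;        /* kernel: -ENOMEM */
        }

        /* ... btrfs_lookup_inode() and friends would use 'path' here ... */

        if (path != in_path)
                free(path);
        return ret;
}

int main(void)
{
        struct path commit_path = { .search_commit_root = 1 };

        printf("%d %d\n", read_inode(NULL), read_inode(&commit_path));
        return 0;
}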
@@ -3674,6 +3699,21 @@ cache_index:
* inode is not a directory, logging its parent unnecessarily.
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Similar reasoning for last_link_trans, needs to be set otherwise
+ * for a case like the following:
+ *
+ * mkdir A
+ * touch foo
+ * ln foo A/bar
+ * echo 2 > /proc/sys/vm/drop_caches
+ * fsync foo
+ * <power failure>
+ *
+ * Would result in link bar and directory A not existing after the power
+ * failure.
+ */
+ BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -3713,7 +3753,8 @@ cache_acl:
btrfs_ino(BTRFS_I(inode)),
root->root_key.objectid, ret);
}
- btrfs_free_path(path);
+ if (path != in_path)
+ btrfs_free_path(path);
if (!maybe_acls)
cache_no_acl(inode);
@@ -4431,31 +4472,6 @@ out:
return err;
}
-static int truncate_space_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytes_deleted)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
-
- /*
- * This is only used to apply pressure to the enospc system, we don't
- * intend to use this reservation at all.
- */
- bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
- bytes_deleted *= fs_info->nodesize;
- ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
- bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
- if (!ret) {
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid,
- bytes_deleted, 1);
- trans->bytes_reserved += bytes_deleted;
- }
- return ret;
-
-}
-
/*
* Return this if we need to call truncate_block for the last bit of the
* truncate.
@@ -4500,7 +4516,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
- bool should_end = false;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
@@ -4531,7 +4546,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
/*
* This function is also used to drop the items in the log tree before
* we relog the inode, so if root != BTRFS_I(inode)->root, it means
- * it is used to drop the loged items. So we shouldn't kill the delayed
+ * it is used to drop the logged items. So we shouldn't kill the delayed
* items.
*/
if (min_type == 0 && root == BTRFS_I(inode)->root)
@@ -4713,15 +4728,7 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (btrfs_should_throttle_delayed_refs(trans))
- btrfs_async_run_delayed_refs(fs_info,
- trans->delayed_ref_updates * 2,
- trans->transid, 0);
if (be_nice) {
- if (truncate_space_check(trans, root,
- extent_num_bytes)) {
- should_end = true;
- }
if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
@@ -4732,7 +4739,7 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot ||
- should_throttle || should_end) {
+ should_throttle) {
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -4744,23 +4751,24 @@ delete:
pending_del_nr = 0;
}
btrfs_release_path(path);
- if (should_throttle) {
- unsigned long updates = trans->delayed_ref_updates;
- if (updates) {
- trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans,
- updates * 2);
- if (ret)
- break;
- }
- }
+
/*
- * if we failed to refill our space rsv, bail out
- * and let the transaction restart
+ * We can generate a lot of delayed refs, so we need to
+ * throttle every once in a while and make sure we're
+ * adding enough space to keep up with the work we are
+ * generating. Since we hold a transaction here we
+ * can't flush, and we don't want to FLUSH_LIMIT because
+ * we could have generated too many delayed refs to
+ * actually allocate, so just bail if we're short and
+ * let the normal reservation dance happen higher up.
*/
- if (should_end) {
- ret = -EAGAIN;
- break;
+ if (should_throttle) {
+ ret = btrfs_delayed_refs_rsv_refill(fs_info,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret) {
+ ret = -EAGAIN;
+ break;
+ }
}
goto search_again;
} else {
@@ -4786,18 +4794,6 @@ out:
}
btrfs_free_path(path);
-
- if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
- unsigned long updates = trans->delayed_ref_updates;
- int err;
-
- if (updates) {
- trans->delayed_ref_updates = 0;
- err = btrfs_run_delayed_refs(trans, updates * 2);
- if (err)
- ret = err;
- }
- }
return ret;
}
@@ -5142,7 +5138,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the end less truncate */
+ /* Disable nonlocked read DIO to avoid the endless truncate */
btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
@@ -5259,11 +5255,13 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+ state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
@@ -5276,7 +5274,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
- if (state->state & EXTENT_DELALLOC)
+ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
@@ -5295,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
int failures = 0;
for (;;) {
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+ ret = btrfs_block_rsv_refill(root, rsv,
+ rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5310,16 +5310,35 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
return ERR_PTR(-ENOSPC);
}
+ /*
+ * Evict can generate a large amount of delayed refs without
+ * having a way to add space back since we exhaust our temporary
+ * block rsv. We aren't allowed to do FLUSH_ALL in this case
+ * because we could deadlock with so many things in the flushing
+ * code, so we have to try and hold some extra space to
+ * compensate for our delayed ref generation. If we can't get
+ * that space then we need see if we can steal our minimum from
+ * the global reserve. We will be ratelimited by the amount of
+ * space we have for the delayed refs rsv, so we'll end up
+ * committing and trying again.
+ */
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans) || !ret)
+ if (IS_ERR(trans) || !ret) {
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
+ }
return trans;
+ }
/*
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(trans) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
+ if (!btrfs_check_space_for_delayed_refs(fs_info) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0))
return trans;
/* If not, commit and try again. */
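
The eviction path now refills its block reservation with delayed_refs_extra headroom and, on success, migrates that extra into the transaction's reservation so the refs it generates are pre-paid. A toy model of that reservation arithmetic (sizes are illustrative, not real btrfs costs):

#include <stdbool.h>
#include <stdio.h>

struct rsv {
        long size;
        long reserved;
};

/* Refill a reservation up to 'want' bytes from a shared pool. */
static bool refill(struct rsv *rsv, long want, long *pool)
{
        long need = want - rsv->reserved;

        if (need <= 0)
                return true;
        if (*pool < need)
                return false;        /* caller commits and retries */
        *pool -= need;
        rsv->reserved += need;
        return true;
}

static void migrate(struct rsv *src, struct rsv *dst, long bytes)
{
        src->reserved -= bytes;
        dst->reserved += bytes;
}

int main(void)
{
        long pool = 1L << 20;                 /* free metadata space */
        struct rsv rsv = { .size = 65536 };   /* temporary eviction rsv */
        struct rsv trans_rsv = { 0 };
        long delayed_refs_extra = 16384;      /* hypothetical 1-item cost */

        if (refill(&rsv, rsv.size + delayed_refs_extra, &pool))
                migrate(&rsv, &trans_rsv, delayed_refs_extra);
        printf("rsv=%ld trans=%ld pool=%ld\n",
               rsv.reserved, trans_rsv.reserved, pool);
        return 0;
}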
@@ -5633,8 +5652,9 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
/* Get an inode object given its location and corresponding root.
* Returns in *is_new if the inode was read from disk
*/
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new)
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new,
+ struct btrfs_path *path)
{
struct inode *inode;
@@ -5645,7 +5665,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
if (inode->i_state & I_NEW) {
int ret;
- ret = btrfs_read_locked_inode(inode);
+ ret = btrfs_read_locked_inode(inode, path);
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
@@ -5667,6 +5687,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return inode;
}
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new)
+{
+ return btrfs_iget_path(s, location, root, new, NULL);
+}
+
static struct inode *new_simple_dir(struct super_block *s,
struct btrfs_key *key,
struct btrfs_root *root)
@@ -6384,14 +6410,19 @@ fail_dir_item:
err = btrfs_del_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
&local_index, name, name_len);
-
+ if (err)
+ btrfs_abort_transaction(trans, err);
} else if (add_backref) {
u64 local_index;
int err;
err = btrfs_del_inode_ref(trans, root, name, name_len,
ino, parent_ino, &local_index);
+ if (err)
+ btrfs_abort_transaction(trans, err);
}
+
+ /* Return the original error code */
return ret;
}
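
The fail_dir_item changes follow a common error-unwind rule: a secondary failure while rolling back aborts the transaction, but the function still returns the original error so callers see the root cause. A sketch of the rule (error values are placeholders):

#include <stdio.h>

static int rollback(void)
{
        return -1;        /* pretend the rollback itself fails */
}

static int add_link(void)
{
        int ret = -28;        /* original failure, say -ENOSPC */
        int err = rollback();

        if (err)        /* kernel: btrfs_abort_transaction(trans, err) */
                fprintf(stderr, "abort transaction: %d\n", err);

        return ret;        /* the original error code wins */
}

int main(void)
{
        printf("add_link() = %d\n", add_link());
        return 0;
}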
@@ -6603,6 +6634,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto fail;
}
+ BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
true, NULL);
@@ -6630,7 +6662,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
int err = 0;
- int drop_on_err = 0;
u64 objectid = 0;
u64 index = 0;
@@ -6656,7 +6687,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
}
- drop_on_err = 1;
/* these must be set before we unlock the inode */
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
@@ -6677,7 +6707,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
d_instantiate_new(dentry, inode);
- drop_on_err = 0;
out_fail:
btrfs_end_transaction(trans);
@@ -6754,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u32 found_type;
+ u8 extent_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
@@ -6809,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (ret < 0) {
err = ret;
goto out;
- }
-
- if (ret != 0) {
+ } else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
@@ -6820,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- /* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
if (found_key.objectid != objectid ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
+ found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
@@ -6835,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto next;
}
- found_type = btrfs_file_extent_type(leaf, item);
+ extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6863,9 +6888,9 @@ next:
if (ret < 0) {
err = ret;
goto out;
- }
- if (ret > 0)
+ } else if (ret > 0) {
goto not_found;
+ }
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6876,19 +6901,22 @@ next:
goto not_found;
if (start > found_key.offset)
goto next;
+
+ /* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
- goto not_found_em;
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item,
new_inline, em);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
@@ -6939,7 +6967,6 @@ not_found:
em->start = start;
em->orig_start = start;
em->len = len;
-not_found_em:
em->block_start = EXTENT_MAP_HOLE;
insert:
btrfs_release_path(path);
@@ -6969,19 +6996,17 @@ out:
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
- u64 range_start = start;
+ u64 delalloc_start = start;
u64 end;
- u64 found;
- u64 found_end;
+ u64 delalloc_len;
+ u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return em;
/*
@@ -7006,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = NULL;
	/* ok, we didn't find anything, let's look for delalloc */
- found = count_range_bits(&inode->io_tree, &range_start,
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
- found_end = range_start + found;
- if (found_end < range_start)
- found_end = (u64)-1;
+ delalloc_end = delalloc_start + delalloc_len;
+ if (delalloc_end < delalloc_start)
+ delalloc_end = (u64)-1;
/*
- * we didn't find anything useful, return
- * the original results from get_extent()
+ * We didn't find anything useful, return the original results from
+ * get_extent()
*/
- if (range_start > end || found_end <= start) {
+ if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
- /* adjust the range_start to make sure it doesn't
- * go backwards from the start they passed in
+ /*
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
+ * the start they passed in
*/
- range_start = max(start, range_start);
- found = found_end - range_start;
+ delalloc_start = max(start, delalloc_start);
+ delalloc_len = delalloc_end - delalloc_start;
- if (found > 0) {
- u64 hole_start = start;
- u64 hole_len = len;
+ if (delalloc_len > 0) {
+ u64 hole_start;
+ u64 hole_len;
+ const u64 hole_end = extent_map_end(hole_em);
em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
+ em->bdev = NULL;
+
+ ASSERT(hole_em);
/*
- * when btrfs_get_extent can't find anything it
- * returns one huge hole
+ * When btrfs_get_extent can't find anything it returns one
+ * huge hole
*
- * make sure what it found really fits our range, and
- * adjust to make sure it is based on the start from
- * the caller
+ * Make sure what it found really fits our range, and adjust to
+ * make sure it is based on the start from the caller
*/
- if (hole_em) {
- u64 calc_end = extent_map_end(hole_em);
-
- if (calc_end <= start || (hole_em->start > end)) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = calc_end - hole_start;
- }
+ if (hole_end <= start || hole_em->start > end) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = hole_end - hole_start;
}
- em->bdev = NULL;
- if (hole_em && range_start > hole_start) {
- /* our hole starts before our delalloc, so we
- * have to return just the parts of the hole
- * that go until the delalloc starts
+
+ if (hole_em && delalloc_start > hole_start) {
+ /*
+ * Our hole starts before our delalloc, so we have to
+ * return just the parts of the hole that go until the
+ * delalloc starts
*/
- em->len = min(hole_len,
- range_start - hole_start);
+ em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
- * don't adjust block start at all,
- * it is fixed at EXTENT_MAP_HOLE
+ * Don't adjust block start at all, it is fixed at
+ * EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
- em->start = range_start;
- em->len = found;
- em->orig_start = range_start;
+ /*
+ * Hole is out of passed range or it starts after
+ * delalloc range
+ */
+ em->start = delalloc_start;
+ em->len = delalloc_len;
+ em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = found;
+ em->block_len = delalloc_len;
}
} else {
return hole_em;
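The renamed delalloc_* variables above implement two guards: clamp the computed end on u64 overflow, and never let the reported start go backwards past what the caller asked for. Just that arithmetic as a self-contained sketch with arbitrary example values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t start = 4096;                 /* caller's range start */
            uint64_t dl_start = 100;
            uint64_t dl_len = UINT64_MAX;          /* forces a wraparound */
            uint64_t dl_end = dl_start + dl_len;   /* wraps to 99 */

            if (dl_end < dl_start)
                    dl_end = UINT64_MAX;           /* clamp on overflow */
            if (dl_start < start)
                    dl_start = start;              /* max(start, dl_start) */
            printf("delalloc [%llu, %llu)\n",
                   (unsigned long long)dl_start,
                   (unsigned long long)dl_end);
            return 0;
    }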
@@ -7800,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7811,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
@@ -7890,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
int uptodate;
int ret;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7903,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);
@@ -8031,9 +8062,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
dio_bio->bi_status = err;
dio_end_io(dio_bio);
-
- if (io_bio->end_io)
- io_bio->end_io(io_bio, blk_status_to_errno(err));
+ btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -8076,7 +8105,7 @@ static void __endio_write_update_ordered(struct inode *inode,
return;
/*
* Our bio might span multiple ordered extents. In this case
- * we keep goin until we have accounted the whole dio.
+ * we keep going until we have accounted the whole dio.
*/
if (ordered_offset < offset + bytes) {
ordered_bytes = offset + bytes - ordered_offset;
@@ -8386,8 +8415,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
if (!ret)
return;
- if (io_bio->end_io)
- io_bio->end_io(io_bio, ret);
+ btrfs_io_bio_free_csum(io_bio);
free_ordered:
/*
@@ -8890,7 +8918,7 @@ again:
/* page is wholly or partially inside EOF */
if (page_start + PAGE_SIZE > size)
- zero_start = size & ~PAGE_MASK;
+ zero_start = offset_in_page(size);
else
zero_start = PAGE_SIZE;
@@ -9135,6 +9163,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_link_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
@@ -9935,7 +9964,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
@@ -9946,7 +9974,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9974,6 +10002,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
}
spin_unlock(&root->delalloc_lock);
+ if (snapshot)
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &binode->runtime_flags);
work = btrfs_alloc_delalloc_work(inode);
if (!work) {
iput(inode);
@@ -10007,7 +10038,7 @@ out:
return ret;
}
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -10015,7 +10046,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1);
+ ret = start_delalloc_inodes(root, -1, true);
if (ret > 0)
ret = 0;
return ret;
@@ -10044,7 +10075,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr);
+ ret = start_delalloc_inodes(root, nr, false);
btrfs_put_fs_root(root);
if (ret < 0)
goto out;
@@ -10423,26 +10454,6 @@ out:
return ret;
}
-__attribute__((const))
-static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
-{
- return -EAGAIN;
-}
-
-static void btrfs_check_extent_io_range(void *private_data, const char *caller,
- u64 start, u64 end)
-{
- struct inode *inode = private_data;
- u64 isize;
-
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
-}
-
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
struct inode *inode = tree->private_data;
@@ -10459,6 +10470,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
}
}
+#ifdef CONFIG_SWAP
+/*
+ * Add an entry indicating a block group or device which is pinned by a
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
+ * negative errno on failure.
+ */
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
+ bool is_block_group)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_swapfile_pin *sp, *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+
+ sp = kmalloc(sizeof(*sp), GFP_NOFS);
+ if (!sp)
+ return -ENOMEM;
+ sp->ptr = ptr;
+ sp->inode = inode;
+ sp->is_block_group = is_block_group;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ p = &fs_info->swapfile_pins.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
+ if (sp->ptr < entry->ptr ||
+ (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
+ p = &(*p)->rb_left;
+ } else if (sp->ptr > entry->ptr ||
+ (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
+ p = &(*p)->rb_right;
+ } else {
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ kfree(sp);
+ return 1;
+ }
+ }
+ rb_link_node(&sp->node, parent, p);
+ rb_insert_color(&sp->node, &fs_info->swapfile_pins);
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ return 0;
+}
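btrfs_add_swapfile_pin() above keys its red-black tree on a two-field comparison, ptr first with inode as tiebreaker, and signals an existing entry with 1 rather than an error. The same ordering logic sketched on a plain binary search tree so it runs without the kernel rbtree helpers; uintptr_t keys stand in for the kernel pointers:

    #include <stdint.h>
    #include <stdlib.h>

    struct pin {
            uintptr_t ptr;          /* block group or device address */
            uintptr_t inode;        /* tiebreaker */
            struct pin *left, *right;
    };

    /* 0 on insert, 1 if the (ptr, inode) pair is already pinned,
     * -1 on allocation failure. */
    static int pin_insert(struct pin **root, uintptr_t ptr, uintptr_t inode)
    {
            struct pin **p = root;
            struct pin *sp;

            while (*p) {
                    struct pin *e = *p;

                    if (ptr < e->ptr || (ptr == e->ptr && inode < e->inode))
                            p = &e->left;
                    else if (ptr > e->ptr || (ptr == e->ptr && inode > e->inode))
                            p = &e->right;
                    else
                            return 1;       /* duplicate: nothing to do */
            }
            sp = calloc(1, sizeof(*sp));
            if (!sp)
                    return -1;
            sp->ptr = ptr;
            sp->inode = inode;
            *p = sp;
            return 0;
    }

Returning 1 for "already pinned" lets a caller such as btrfs_swap_activate() treat a duplicate as success without a separate lookup.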
+
+/* Free all of the entries pinned by this swapfile. */
+static void btrfs_free_swapfile_pins(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_swapfile_pin *sp;
+ struct rb_node *node, *next;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ node = rb_first(&fs_info->swapfile_pins);
+ while (node) {
+ next = rb_next(node);
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+ if (sp->inode == inode) {
+ rb_erase(&sp->node, &fs_info->swapfile_pins);
+ if (sp->is_block_group)
+ btrfs_put_block_group(sp->ptr);
+ kfree(sp);
+ }
+ node = next;
+ }
+ spin_unlock(&fs_info->swapfile_pins_lock);
+}
+
+struct btrfs_swap_info {
+ u64 start;
+ u64 block_start;
+ u64 block_len;
+ u64 lowest_ppage;
+ u64 highest_ppage;
+ unsigned long nr_pages;
+ int nr_extents;
+};
+
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
+ struct btrfs_swap_info *bsi)
+{
+ unsigned long nr_pages;
+ u64 first_ppage, first_ppage_reported, next_ppage;
+ int ret;
+
+ first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
+
+ first_ppage_reported = first_ppage;
+ if (bsi->start == 0)
+ first_ppage_reported++;
+ if (bsi->lowest_ppage > first_ppage_reported)
+ bsi->lowest_ppage = first_ppage_reported;
+ if (bsi->highest_ppage < (next_ppage - 1))
+ bsi->highest_ppage = next_ppage - 1;
+
+ ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
+ if (ret < 0)
+ return ret;
+ bsi->nr_extents += ret;
+ bsi->nr_pages += nr_pages;
+ return 0;
+}
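btrfs_add_swap_extent() hands the swap layer only whole pages: the extent start is rounded up, the end rounded down, and the extent is skipped entirely when no whole page remains. The same arithmetic as a runnable userspace sketch, assuming 4 KiB pages:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE_  4096ULL
    #define PAGE_SHIFT_ 12
    #define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((a) - 1))
    #define ALIGN_DOWN_(x, a) ((x) & ~((a) - 1))

    int main(void)
    {
            uint64_t block_start = 6000, block_len = 20000;
            uint64_t first = ALIGN_UP(block_start, PAGE_SIZE_) >> PAGE_SHIFT_;
            uint64_t next  = ALIGN_DOWN_(block_start + block_len, PAGE_SIZE_)
                             >> PAGE_SHIFT_;

            if (first >= next)
                    printf("no whole page fits\n");
            else    /* here: pages 2..5 */
                    printf("pages %llu..%llu usable\n",
                           (unsigned long long)first,
                           (unsigned long long)(next - 1));
            return 0;
    }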
+
+static void btrfs_swap_deactivate(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ btrfs_free_swapfile_pins(inode);
+ atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em = NULL;
+ struct btrfs_device *device = NULL;
+ struct btrfs_swap_info bsi = {
+ .lowest_ppage = (sector_t)-1ULL,
+ };
+ int ret = 0;
+ u64 isize;
+ u64 start;
+
+ /*
+ * If the swap file was just created, make sure delalloc is done. If the
+ * file changes again after this, the user is doing something stupid and
+ * we don't really care.
+ */
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret)
+ return ret;
+
+ /*
+ * The inode is locked, so these flags won't change after we check them.
+ */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
+ btrfs_warn(fs_info, "swapfile must not be compressed");
+ return -EINVAL;
+ }
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
+ btrfs_warn(fs_info, "swapfile must not be copy-on-write");
+ return -EINVAL;
+ }
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_warn(fs_info, "swapfile must not be checksummed");
+ return -EINVAL;
+ }
+
+ /*
+ * Balance or device remove/replace/resize can move stuff around from
+ * under us. The EXCL_OP flag makes sure they aren't running/won't run
+ * concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap file
+ * is active and moving the extents. Note that this also prevents a
+ * concurrent device add which isn't actually necessary, but it's not
+ * really worth the trouble to allow it.
+ */
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ btrfs_warn(fs_info,
+ "cannot activate swapfile while exclusive operation is running");
+ return -EBUSY;
+ }
+ /*
+ * Snapshots can create extents which require COW even if NODATACOW is
+ * set. We use this counter to prevent snapshots. We must increment it
+ * before walking the extents because we don't want a concurrent
+ * snapshot to run after we've already checked the extents.
+ */
+ atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
+
+ isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
+
+ lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+ start = 0;
+ while (start < isize) {
+ u64 logical_block_start, physical_block_start;
+ struct btrfs_block_group_cache *bg;
+ u64 len = isize - start;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ /*
+ * It's unlikely we'll ever actually find ourselves
+ * here, as a file small enough to fit inline won't be
+ * big enough to store more than the swap header, but in
+ * case something changes in the future, let's catch it
+ * here rather than later.
+ */
+ btrfs_warn(fs_info, "swapfile must not be inline");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ btrfs_warn(fs_info, "swapfile must not be compressed");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = em->block_start + (start - em->start);
+ len = min(len, em->len - (start - em->start));
+ free_extent_map(em);
+ em = NULL;
+
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ } else {
+ btrfs_warn(fs_info,
+ "swapfile must not be copy-on-write");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ btrfs_warn(fs_info,
+ "swapfile must have single data profile");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (device == NULL) {
+ device = em->map_lookup->stripes[0].dev;
+ ret = btrfs_add_swapfile_pin(inode, device, false);
+ if (ret == 1)
+ ret = 0;
+ else if (ret)
+ goto out;
+ } else if (device != em->map_lookup->stripes[0].dev) {
+ btrfs_warn(fs_info, "swapfile must be on one device");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ physical_block_start = (em->map_lookup->stripes[0].physical +
+ (logical_block_start - em->start));
+ len = min(len, em->len - (logical_block_start - em->start));
+ free_extent_map(em);
+ em = NULL;
+
+ bg = btrfs_lookup_block_group(fs_info, logical_block_start);
+ if (!bg) {
+ btrfs_warn(fs_info,
+ "could not find block group containing swapfile");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_add_swapfile_pin(inode, bg, true);
+ if (ret) {
+ btrfs_put_block_group(bg);
+ if (ret == 1)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ if (bsi.block_len &&
+ bsi.block_start + bsi.block_len == physical_block_start) {
+ bsi.block_len += len;
+ } else {
+ if (bsi.block_len) {
+ ret = btrfs_add_swap_extent(sis, &bsi);
+ if (ret)
+ goto out;
+ }
+ bsi.start = start;
+ bsi.block_start = physical_block_start;
+ bsi.block_len = len;
+ }
+
+ start += len;
+ }
+
+ if (bsi.block_len)
+ ret = btrfs_add_swap_extent(sis, &bsi);
+
+out:
+ if (!IS_ERR_OR_NULL(em))
+ free_extent_map(em);
+
+ unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+
+ if (ret)
+ btrfs_swap_deactivate(file);
+
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
+ if (ret)
+ return ret;
+
+ if (device)
+ sis->bdev = device->bdev;
+ *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
+ sis->max = bsi.nr_pages;
+ sis->pages = bsi.nr_pages - 1;
+ sis->highest_bit = bsi.nr_pages - 1;
+ return bsi.nr_extents;
+}
+#else
+static void btrfs_swap_deactivate(struct file *file)
+{
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
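The #else branch that closes this hunk is the usual config-gated stub pattern: with CONFIG_SWAP disabled, the same entry points exist but report -EOPNOTSUPP, so the address_space_operations table below needs no #ifdef of its own. A tiny sketch of the shape; FEATURE_SWAP is a hypothetical stand-in for the kernel config symbol:

    #include <errno.h>
    #include <stdio.h>

    /* #define FEATURE_SWAP 1 */

    #ifdef FEATURE_SWAP
    static int swap_activate(int fd) { (void)fd; return 0; /* real work */ }
    #else
    static int swap_activate(int fd) { (void)fd; return -EOPNOTSUPP; }
    #endif

    int main(void)
    {
            printf("%d\n", swap_activate(3));
            return 0;
    }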
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10501,17 +10849,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
/* mandatory callbacks */
.submit_bio_hook = btrfs_submit_bio_hook,
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
- .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
-
- /* optional callbacks */
- .fill_delalloc = run_delalloc_range,
- .writepage_end_io_hook = btrfs_writepage_end_io_hook,
- .writepage_start_hook = btrfs_writepage_start_hook,
- .set_bit_hook = btrfs_set_bit_hook,
- .clear_bit_hook = btrfs_clear_bit_hook,
- .merge_extent_hook = btrfs_merge_extent_hook,
- .split_extent_hook = btrfs_split_extent_hook,
- .check_extent_io_range = btrfs_check_extent_io_range,
};
/*
@@ -10536,6 +10873,8 @@ static const struct address_space_operations btrfs_aops = {
.releasepage = btrfs_releasepage,
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = btrfs_swap_activate,
+ .swap_deactivate = btrfs_swap_deactivate,
};
static const struct inode_operations btrfs_file_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a990a9045139..494f0f10d70e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
} else if (fsflags & FS_COMPR_FL) {
const char *comp;
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ goto out_unlock;
+ }
+
binode->flags |= BTRFS_INODE_COMPRESS;
binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
@@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ if (atomic_read(&root->nr_swapfiles)) {
+ btrfs_warn(fs_info,
+ "cannot snapshot subvolume with active swapfile");
+ return -ETXTBSY;
+ }
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
if (!pending_snapshot)
return -ENOMEM;
@@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
wait_event(root->subv_writers->wait,
percpu_counter_sum(&root->subv_writers->counter) == 0);
- ret = btrfs_start_delalloc_inodes(root);
+ ret = btrfs_start_delalloc_snapshot(root);
if (ret)
goto dec_and_free;
@@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
inode_lock(inode);
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ } else {
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ }
if (ret < 0) {
inode_unlock(inode);
goto out_ra;
@@ -1627,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
rcu_read_unlock();
- memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
+ memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
fi_args->nodesize = fs_info->nodesize;
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
@@ -3163,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
s_uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
+ NULL, true);
if (!dev) {
ret = -ENODEV;
@@ -3191,92 +3207,6 @@ out:
return ret;
}
-static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
-{
- struct page *page;
-
- page = grab_cache_page(inode->i_mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- if (!PageUptodate(page)) {
- int ret;
-
- ret = btrfs_readpage(NULL, page);
- if (ret)
- return ERR_PTR(ret);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EAGAIN);
- }
- }
-
- return page;
-}
-
-static int gather_extent_pages(struct inode *inode, struct page **pages,
- int num_pages, u64 off)
-{
- int i;
- pgoff_t index = off >> PAGE_SHIFT;
-
- for (i = 0; i < num_pages; i++) {
-again:
- pages[i] = extent_same_get_page(inode, index + i);
- if (IS_ERR(pages[i])) {
- int err = PTR_ERR(pages[i]);
-
- if (err == -EAGAIN)
- goto again;
- pages[i] = NULL;
- return err;
- }
- }
- return 0;
-}
-
-static int lock_extent_range(struct inode *inode, u64 off, u64 len,
- bool retry_range_locking)
-{
- /*
- * Do any pending delalloc/csum calculations on inode, one way or
- * another, and lock file content.
- * The locking order is:
- *
- * 1) pages
- * 2) range in the inode's io tree
- */
- while (1) {
- struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- off + len - 1);
- if ((!ordered ||
- ordered->file_offset + ordered->len <= off ||
- ordered->file_offset >= off + len) &&
- !test_range_bit(&BTRFS_I(inode)->io_tree, off,
- off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- if (!retry_range_locking)
- return -EAGAIN;
- btrfs_wait_ordered_range(inode, off, len);
- }
- return 0;
-}
-
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
inode_unlock(inode1);
@@ -3293,258 +3223,37 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+ struct inode *inode2, u64 loff2, u64 len)
{
unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len,
- bool retry_range_locking)
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
{
- int ret;
-
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
+ } else if (inode1 == inode2 && loff2 < loff1) {
+ swap(loff1, loff2);
}
- ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
- if (ret)
- return ret;
- ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
- if (ret)
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
- loff1 + len - 1);
- return ret;
-}
-
-struct cmp_pages {
- int num_pages;
- struct page **src_pages;
- struct page **dst_pages;
-};
-
-static void btrfs_cmp_data_free(struct cmp_pages *cmp)
-{
- int i;
- struct page *pg;
-
- for (i = 0; i < cmp->num_pages; i++) {
- pg = cmp->src_pages[i];
- if (pg) {
- unlock_page(pg);
- put_page(pg);
- cmp->src_pages[i] = NULL;
- }
- pg = cmp->dst_pages[i];
- if (pg) {
- unlock_page(pg);
- put_page(pg);
- cmp->dst_pages[i] = NULL;
- }
- }
-}
-
-static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
- struct inode *dst, u64 dst_loff,
- u64 len, struct cmp_pages *cmp)
-{
- int ret;
- int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
-
- cmp->num_pages = num_pages;
-
- ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
- if (ret)
- goto out;
-
- ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
-
-out:
- if (ret)
- btrfs_cmp_data_free(cmp);
- return ret;
-}
-
-static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
-{
- int ret = 0;
- int i;
- struct page *src_page, *dst_page;
- unsigned int cmp_len = PAGE_SIZE;
- void *addr, *dst_addr;
-
- i = 0;
- while (len) {
- if (len < PAGE_SIZE)
- cmp_len = len;
-
- BUG_ON(i >= cmp->num_pages);
-
- src_page = cmp->src_pages[i];
- dst_page = cmp->dst_pages[i];
- ASSERT(PageLocked(src_page));
- ASSERT(PageLocked(dst_page));
-
- addr = kmap_atomic(src_page);
- dst_addr = kmap_atomic(dst_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dst_page);
-
- if (memcmp(addr, dst_addr, cmp_len))
- ret = -EBADE;
-
- kunmap_atomic(addr);
- kunmap_atomic(dst_addr);
-
- if (ret)
- break;
-
- len -= cmp_len;
- i++;
- }
-
- return ret;
-}
-
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
- u64 olen)
-{
- u64 len = *plen;
- u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
-
- if (off + olen > inode->i_size || off + olen < off)
- return -EINVAL;
-
- /* if we extend to eof, continue to block boundary */
- if (off + len == inode->i_size)
- *plen = len = ALIGN(inode->i_size, bs) - off;
-
- /* Check that we are block aligned - btrfs_clone() requires this */
- if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
- return -EINVAL;
-
- return 0;
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
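btrfs_double_extent_lock() now takes the two ranges in a canonical order: lower inode pointer first and, within a single inode, lower offset first. Ordering a lock pair this way is the standard defence against ABBA deadlocks. A sketch of the idea with pthread mutexes, where comparing raw addresses mirrors the inode-pointer comparison above:

    #include <pthread.h>
    #include <stdio.h>

    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a == b) {           /* same object: take it once */
                    pthread_mutex_lock(a);
                    return;
            }
            if (a > b) {            /* canonicalize: lower address first */
                    pthread_mutex_t *t = a;
                    a = b;
                    b = t;
            }
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
    }

    int main(void)
    {
            static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
            static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

            /* Both argument orders end up locking m1 before m2. */
            lock_pair(&m2, &m1);
            puts("locked");
            return 0;
    }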
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff,
- struct cmp_pages *cmp)
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
{
int ret;
- u64 len = olen;
- bool same_inode = (src == dst);
- u64 same_lock_start = 0;
- u64 same_lock_len = 0;
-
- ret = extent_same_check_offsets(src, loff, &len, olen);
- if (ret)
- return ret;
-
- ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
- if (ret)
- return ret;
-
- if (same_inode) {
- /*
- * Single inode case wants the same checks, except we
- * don't want our length pushed out past i_size as
- * comparing that data range makes no sense.
- *
- * extent_same_check_offsets() will do this for an
- * unaligned length at i_size, so catch it here and
- * reject the request.
- *
- * This effectively means we require aligned extents
- * for the single-inode case, whereas the other cases
- * allow an unaligned length so long as it ends at
- * i_size.
- */
- if (len != olen)
- return -EINVAL;
- /* Check for overlapping ranges */
- if (dst_loff + len > loff && dst_loff < loff + len)
- return -EINVAL;
-
- same_lock_start = min_t(u64, loff, dst_loff);
- same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
- } else {
- /*
- * If the source and destination inodes are different, the
- * source's range end offset matches the source's i_size, that
- * i_size is not a multiple of the sector size, and the
- * destination range does not go past the destination's i_size,
- * we must round down the length to the nearest sector size
- * multiple. If we don't do this adjustment we end replacing
- * with zeroes the bytes in the range that starts at the
- * deduplication range's end offset and ends at the next sector
- * size multiple.
- */
- if (loff + olen == i_size_read(src) &&
- dst_loff + len < i_size_read(dst)) {
- const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
-
- len = round_down(i_size_read(src), sz) - loff;
- olen = len;
- }
- }
-
-again:
- ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
- if (ret)
- return ret;
-
- if (same_inode)
- ret = lock_extent_range(src, same_lock_start, same_lock_len,
- false);
- else
- ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
- false);
/*
- * If one of the inodes has dirty pages in the respective range or
- * ordered extents, we need to flush dellaloc and wait for all ordered
- * extents in the range. We must unlock the pages and the ranges in the
- * io trees to avoid deadlocks when flushing delalloc (requires locking
- * pages) and when waiting for ordered extents to complete (they require
- * range locking).
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
*/
- if (ret == -EAGAIN) {
- /*
- * Ranges in the io trees already unlocked. Now unlock all
- * pages before waiting for all IO to complete.
- */
- btrfs_cmp_data_free(cmp);
- if (same_inode) {
- btrfs_wait_ordered_range(src, same_lock_start,
- same_lock_len);
- } else {
- btrfs_wait_ordered_range(src, loff, len);
- btrfs_wait_ordered_range(dst, dst_loff, len);
- }
- goto again;
- }
- ASSERT(ret == 0);
- if (WARN_ON(ret)) {
- /* ranges in the io trees already unlocked */
- btrfs_cmp_data_free(cmp);
- return ret;
- }
-
- /* pass original length for comparison so we stay within i_size */
- ret = btrfs_cmp_data(olen, cmp);
- if (ret == 0)
- ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
-
- if (same_inode)
- unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
- same_lock_start + same_lock_len - 1);
- else
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
-
- btrfs_cmp_data_free(cmp);
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
}
@@ -3555,58 +3264,16 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
- struct cmp_pages cmp;
- int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
- bool same_inode = (src == dst);
u64 i, tail_len, chunk_count;
- if (olen == 0)
- return 0;
-
- if (same_inode)
- inode_lock(src);
- else
- btrfs_double_inode_lock(src, dst);
-
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
- if (chunk_count == 0)
- num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
-
- /*
- * If deduping ranges in the same inode, locking rules make it
- * mandatory to always lock pages in ascending order to avoid deadlocks
- * with concurrent tasks (such as starting writeback/delalloc).
- */
- if (same_inode && dst_loff < loff)
- swap(loff, dst_loff);
-
- /*
- * We must gather up all the pages before we initiate our extent
- * locking. We use an array for the page pointers. Size of the array is
- * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
- */
- cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_ZERO);
- cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_ZERO);
- if (!cmp.src_pages || !cmp.dst_pages) {
- ret = -ENOMEM;
- goto out_free;
- }
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff, &cmp);
+ dst, dst_loff);
if (ret)
- goto out_free;
+ return ret;
loff += BTRFS_MAX_DEDUPE_LEN;
dst_loff += BTRFS_MAX_DEDUPE_LEN;
@@ -3614,41 +3281,11 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (tail_len > 0)
ret = btrfs_extent_same_range(src, loff, tail_len, dst,
- dst_loff, &cmp);
-
-out_free:
- kvfree(cmp.src_pages);
- kvfree(cmp.dst_pages);
-
-out_unlock:
- if (same_inode)
- inode_unlock(src);
- else
- btrfs_double_inode_unlock(src, dst);
+ dst_loff);
return ret;
}
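btrfs_extent_same() above now simply walks the request in BTRFS_MAX_DEDUPE_LEN chunks plus a tail, advancing source and destination offsets in lockstep; all the page gathering and byte comparison moved into the VFS prep. The bare chunking loop as a runnable sketch; MAX_CHUNK is an arbitrary stand-in value:

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_CHUNK (16ULL * 1024 * 1024) /* stand-in for BTRFS_MAX_DEDUPE_LEN */

    int main(void)
    {
            uint64_t olen = 40ULL * 1024 * 1024 + 123;
            uint64_t loff = 0, dst_loff = 0, i;
            uint64_t chunks = olen / MAX_CHUNK;
            uint64_t tail = olen % MAX_CHUNK;

            for (i = 0; i < chunks; i++) {
                    printf("dedupe %llu bytes: %llu -> %llu\n",
                           (unsigned long long)MAX_CHUNK,
                           (unsigned long long)loff,
                           (unsigned long long)dst_loff);
                    loff += MAX_CHUNK;
                    dst_loff += MAX_CHUNK;
            }
            if (tail)
                    printf("dedupe tail of %llu bytes\n",
                           (unsigned long long)tail);
            return 0;
    }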
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
- struct file *dst_file, loff_t dst_loff,
- u64 olen)
-{
- struct inode *src = file_inode(src_file);
- struct inode *dst = file_inode(dst_file);
- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
- if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
- /*
- * Btrfs does not support blocksize < page_size. As a
- * result, btrfs_cmp_data() won't correctly handle
- * this situation without an update.
- */
- return -EINVAL;
- }
-
- return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
-}
-
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
@@ -4231,11 +3868,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
u64 len = olen;
u64 bs = fs_info->sb->s_blocksize;
- int same_inode = src == inode;
/*
* TODO:
@@ -4248,93 +3883,43 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* be either compressed or non-compressed.
*/
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- if (file_src->f_path.mnt != file->f_path.mnt ||
- src->i_sb != inode->i_sb)
- return -EXDEV;
-
- if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- return -EISDIR;
-
- if (!same_inode) {
- btrfs_double_inode_lock(src, inode);
- } else {
- inode_lock(src);
- }
-
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* determine range to clone */
- ret = -EINVAL;
- if (off + len > src->i_size || off + len < off)
- goto out_unlock;
- if (len == 0)
- olen = len = src->i_size - off;
- /* if we extend to eof, continue to block boundary */
+ /*
+ * VFS's generic_remap_file_range_prep() protects us from cloning the
+ * eof block into the middle of a file, which would result in corruption
+ * if the file size is not blocksize aligned. So we don't need to check
+ * for that case here.
+ */
if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
- if (len == 0) {
- ret = 0;
- goto out_unlock;
- }
-
- /* verify the end result is block aligned */
- if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
- !IS_ALIGNED(destoff, bs))
- goto out_unlock;
-
- /* verify if ranges are overlapped within the same file */
- if (same_inode) {
- if (destoff + len > off && destoff < off + len)
- goto out_unlock;
- }
-
if (destoff > inode->i_size) {
+ const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
- goto out_unlock;
+ return ret;
+ /*
+ * We may have truncated the last block if the inode's size is
+ * not sector size aligned, so we need to wait for writeback to
+ * complete before proceeding further, otherwise we can race
+ * with cloning and attempt to increment a reference to an
+ * extent that no longer exists (writeback completed right after
+ * we found the previous extent covering eof and before we
+ * attempted to increment its reference count).
+ */
+ ret = btrfs_wait_ordered_range(inode, wb_start,
+ destoff - wb_start);
+ if (ret)
+ return ret;
}
/*
- * Lock the target range too. Right after we replace the file extent
- * items in the fs tree (which now point to the cloned data), we might
- * have a worker replace them with extent items relative to a write
- * operation that was issued before this clone operation (i.e. confront
- * with inode.c:btrfs_finish_ordered_io).
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
*/
- if (same_inode) {
- u64 lock_start = min_t(u64, off, destoff);
- u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
-
- ret = lock_extent_range(src, lock_start, lock_len, true);
- } else {
- ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
- true);
- }
- ASSERT(ret == 0);
- if (WARN_ON(ret)) {
- /* ranges in the io trees already unlocked */
- goto out_unlock;
- }
-
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
-
- if (same_inode) {
- u64 lock_start = min_t(u64, off, destoff);
- u64 lock_end = max_t(u64, off, destoff) + len - 1;
-
- unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
- } else {
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
- }
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
@@ -4342,18 +3927,125 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
truncate_inode_pages_range(&inode->i_data,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
-out_unlock:
+
+ return ret;
+}
+
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+ bool same_inode = inode_out == inode_in;
+ u64 wb_len;
+ int ret;
+
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+ if (btrfs_root_readonly(root_out))
+ return -EROFS;
+
+ if (file_in->f_path.mnt != file_out->f_path.mnt ||
+ inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+ }
+
+ if (same_inode)
+ inode_lock(inode_in);
+ else
+ btrfs_double_inode_lock(inode_in, inode_out);
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Now that the inodes are locked, we need to start writeback ourselves
+ * and can not rely on the writeback from the VFS's generic helper
+ * generic_remap_file_range_prep() because:
+ *
+	 * 1) For compression we must call filemap_fdatawrite_range() on the range
+ * twice (btrfs_fdatawrite_range() does it for us), and the generic
+ * helper only calls it once;
+ *
+	 * 2) filemap_fdatawrite_range(), called by the generic helper, only
+ * waits for the writeback to complete, i.e. for IO to be done, and
+ * not for the ordered extents to complete. We need to wait for them
+ * to complete so that new file extent items are in the fs tree.
+ */
+ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+ wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ else
+ wb_len = ALIGN(*len, bs);
+
+ /*
+ * Since we don't lock ranges, wait for ongoing lockless dio writes (as
+	 * any in progress could create their ordered extents after we wait for
+ * existing ordered extents below).
+ */
+ inode_dio_wait(inode_in);
if (!same_inode)
- btrfs_double_inode_unlock(src, inode);
+ inode_dio_wait(inode_out);
+
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ wb_len);
+ if (ret < 0)
+ goto out_unlock;
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ wb_len);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+ if (ret < 0 || *len == 0)
+ goto out_unlock;
+
+ return 0;
+
+ out_unlock:
+ if (same_inode)
+ inode_unlock(inode_in);
else
- inode_unlock(src);
+ btrfs_double_inode_unlock(inode_in, inode_out);
+
return ret;
}
-int btrfs_clone_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, u64 len)
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
{
- return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ bool same_inode = dst_inode == src_inode;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ return ret;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ else
+ ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+
+ if (same_inode)
+ inode_unlock(src_inode);
+ else
+ btrfs_double_inode_unlock(src_inode, dst_inode);
+
+ return ret < 0 ? ret : len;
}
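btrfs_remap_file_range() is a small dispatcher: unknown flags are rejected up front, the shared prep may shorten or zero the length, and on success the function reports the number of bytes remapped rather than 0. A hedged sketch of that control flow; prep(), do_dedupe() and do_clone() are placeholders for the real helpers:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define REMAP_DEDUP    0x1
    #define REMAP_ADVISORY 0x2

    static int prep(int64_t *len)     { (void)len; return 0; } /* placeholder */
    static int do_dedupe(int64_t len) { (void)len; return 0; } /* placeholder */
    static int do_clone(int64_t len)  { (void)len; return 0; } /* placeholder */

    static int64_t remap_range(unsigned int flags, int64_t len)
    {
            int ret;

            if (flags & ~(REMAP_DEDUP | REMAP_ADVISORY))
                    return -EINVAL;         /* unknown flags rejected */
            ret = prep(&len);               /* may shorten or zero len */
            if (ret < 0 || len == 0)
                    return ret;
            ret = (flags & REMAP_DEDUP) ? do_dedupe(len) : do_clone(len);
            return ret < 0 ? ret : len;     /* success reports bytes done */
    }

    int main(void)
    {
            printf("%lld\n", (long long)remap_range(REMAP_DEDUP, 4096));
            return 0;
    }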
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
@@ -4663,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4696,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4720,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
ret = btrfs_get_dev_stats(fs_info, sa);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4766,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
break;
}
- if (copy_to_user(arg, p, sizeof(*p)))
+ if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
out:
kfree(p);
@@ -5072,7 +4764,7 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if (arg) {
+ if ((ret == 0 || ret == -ECANCELED) && arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
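Several ioctl hunks above converge on one fix: the result struct is copied back to user space only when the operation actually produced meaningful state, i.e. success, or for balance and dev-replace also -ECANCELED, which still carries progress. A failed call therefore can no longer clobber the caller's buffer. A sketch of the pattern; copy_out() is a userspace stand-in for copy_to_user() and keeps its return convention of bytes not copied:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct args { int progress; };

    /* stand-in for copy_to_user(): returns the number of bytes NOT copied */
    static unsigned long copy_out(void *dst, const void *src, unsigned long n)
    {
            memcpy(dst, src, n);
            return 0;
    }

    static int finish_ioctl(int ret, const struct args *a, void *user_arg)
    {
            if ((ret == 0 || ret == -ECANCELED) &&
                copy_out(user_arg, a, sizeof(*a)))
                    ret = -EFAULT;
            return ret;
    }

    int main(void)
    {
            struct args a = { .progress = 100 }, out = { 0 };

            printf("%d %d\n", finish_ioctl(0, &a, &out), out.progress);
            return 0;
    }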
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1da768e5ef75..82b84e4daad1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,43 +14,58 @@
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
-/*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
- */
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
/*
- * no lock is required. The lock owner may change if
- * we have a read lock, but it won't change to or away
- * from us. If we have the write lock, we are the owner
- * and it'll never change.
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
- if (rw == BTRFS_WRITE_LOCK) {
- if (atomic_read(&eb->blocking_writers) == 0) {
- WARN_ON(atomic_read(&eb->spinning_writers) != 1);
- atomic_dec(&eb->spinning_writers);
- btrfs_assert_tree_locked(eb);
- atomic_inc(&eb->blocking_writers);
- write_unlock(&eb->lock);
- }
- } else if (rw == BTRFS_READ_LOCK) {
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
- read_unlock(&eb->lock);
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
+}
+
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
}
}
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
+}
+
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
{
/*
* no lock is required. The lock owner may change if
@@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
-
- if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_writers) != 1);
- write_lock(&eb->lock);
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_inc(&eb->spinning_writers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_writers))
- cond_wake_up_nomb(&eb->write_lock_wq);
- } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_readers) == 0);
- read_lock(&eb->lock);
- atomic_inc(&eb->spinning_readers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- }
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ cond_wake_up_nomb(&eb->write_lock_wq);
}
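This locking.c refactor splits one rw-flag helper into dedicated read and write variants, so each path becomes straight-line code and the BTRFS_READ_LOCK/BTRFS_WRITE_LOCK dispatch disappears from the hot path. A toy sketch of the before/after shape of such a split:

    #include <stdio.h>

    enum { LOCK_READ, LOCK_WRITE };

    /* old shape: one entry point, runtime dispatch on a flag */
    static void set_blocking_rw(int rw)
    {
            if (rw == LOCK_WRITE)
                    printf("write -> blocking\n");
            else
                    printf("read -> blocking\n");
    }

    /* new shape: two entry points, callers state their intent directly */
    static void set_blocking_read(void)  { printf("read -> blocking\n"); }
    static void set_blocking_write(void) { printf("write -> blocking\n"); }

    int main(void)
    {
            set_blocking_rw(LOCK_READ); /* old call site */
            set_blocking_read();        /* new equivalent */
            set_blocking_write();
            return 0;
    }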
/*
@@ -232,16 +237,9 @@ again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_readers)) {
+ if (atomic_read(&eb->blocking_readers) ||
+ atomic_read(&eb->blocking_writers)) {
write_unlock(&eb->lock);
- wait_event(eb->read_lock_wq,
- atomic_read(&eb->blocking_readers) == 0);
- goto again;
- }
- if (atomic_read(&eb->blocking_writers)) {
- write_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
goto again;
}
WARN_ON(atomic_read(&eb->spinning_writers));
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 29135def468e..595014f64830 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
-static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
-}
-
-static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
-}
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b6a4cc178bee..579d53ae256f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -27,7 +27,7 @@
* Records the total size (including the header) of compressed data.
*
* 2. Segment(s)
- * Variable size. Each segment includes one segment header, followd by data
+ * Variable size. Each segment includes one segment header, followed by data
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
@@ -61,6 +61,28 @@ struct workspace {
struct list_head list;
};
+static struct workspace_manager wsm;
+
+static void lzo_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
+}
+
+static void lzo_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *lzo_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&wsm, level);
+}
+
+static void lzo_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(void)
+static struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -485,11 +507,16 @@ out:
return ret;
}
-static void lzo_set_level(struct list_head *ws, unsigned int type)
+static unsigned int lzo_set_level(unsigned int level)
{
+ return 0;
}
const struct btrfs_compress_op btrfs_lzo_compress = {
+ .init_workspace_manager = lzo_init_workspace_manager,
+ .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
+ .get_workspace = lzo_get_workspace,
+ .put_workspace = lzo_put_workspace,
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef208b8b9..6fde2b2741ef 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
struct rb_node *node;
- bool dec_pending_ordered = false;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
- dec_pending_ordered = true;
spin_unlock_irq(&tree->lock);
- /*
- * The current running transaction is waiting on us, we need to let it
- * know that we're complete and wake it up.
- */
- if (dec_pending_ordered) {
- struct btrfs_transaction *trans;
-
- /*
- * The checks for trans are just a formality, it should be set,
- * but if it isn't we don't want to deref/assert under the spin
- * lock, so be nice and check if trans is set, but ASSERT() so
- * if it isn't set a developer will notice.
- */
- spin_lock(&fs_info->trans_lock);
- trans = fs_info->running_transaction;
- if (trans)
- refcount_inc(&trans->use_count);
- spin_unlock(&fs_info->trans_lock);
-
- ASSERT(trans);
- if (trans) {
- if (atomic_dec_and_test(&trans->pending_ordered))
- wake_up(&trans->pending_wait);
- btrfs_put_transaction(trans);
- }
- }
-
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813aaa261..fb9a161f0215 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -37,28 +37,31 @@ struct btrfs_ordered_sum {
* rbtree, just before waking any waiters. It is used to indicate the
* IO is done and any metadata is inserted into the tree.
*/
-#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
-
-#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
-
-#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
-
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
-
-#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */
-
-#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
-
-#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
-
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
- * has done its due diligence in updating
- * the isize. */
-#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */
-
-#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to
- * complete in the current transaction. */
-#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */
+enum {
+ /* set when all the pages are written */
+ BTRFS_ORDERED_IO_DONE,
+ /* set when removed from the tree */
+ BTRFS_ORDERED_COMPLETE,
+ /* set when we want to write in place */
+ BTRFS_ORDERED_NOCOW,
+ /* writing a zlib compressed extent */
+ BTRFS_ORDERED_COMPRESSED,
+ /* set when writing to preallocated extent */
+ BTRFS_ORDERED_PREALLOC,
+ /* set when we're doing DIO with this extent */
+ BTRFS_ORDERED_DIRECT,
+ /* We had an io error when writing this out */
+ BTRFS_ORDERED_IOERR,
+ /*
+ * indicates whether this ordered extent has done its due diligence in
+ * updating the isize
+ */
+ BTRFS_ORDERED_UPDATED_ISIZE,
+ /* Set when we have to truncate an extent */
+ BTRFS_ORDERED_TRUNCATED,
+ /* Regular IO for COW */
+ BTRFS_ORDERED_REGULAR,
+};
struct btrfs_ordered_extent {
/* logical offset in the file */
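Since these constants are bit numbers consumed by set_bit()/test_bit() rather than masks, the #define-to-enum conversion above is behavior preserving as long as the ordering is kept. A minimal standalone sketch of the same idiom (plain C with shortened hypothetical names, not kernel code):

#include <stdio.h>

enum {
	ORDERED_IO_DONE,	/* was "#define ... 0" */
	ORDERED_COMPLETE,	/* was "#define ... 1" */
	ORDERED_NOCOW,		/* was "#define ... 2" */
};

int main(void)
{
	unsigned long flags = 0;

	flags |= 1UL << ORDERED_COMPLETE;		/* set_bit() analogue */
	printf("complete=%d nocow=%d\n",
	       !!(flags & (1UL << ORDERED_COMPLETE)),	/* test_bit() analogue */
	       !!(flags & (1UL << ORDERED_NOCOW)));
	return 0;
}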
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 45868fd76209..c1cd5558a646 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,7 +30,7 @@
* - sync
* - copy also limits on subvol creation
* - limit
- * - caches fuer ulists
+ * - caches for ulists
* - performance benchmarks
* - check all ioctl parameters
*/
@@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
__del_qgroup_rb(qgroup);
}
/*
- * we call btrfs_free_qgroup_config() when umounting
+ * We call btrfs_free_qgroup_config() when unmounting
* filesystem and disabling quota, so we set qgroup_ulist
* to be null here to avoid double free.
*/
@@ -1013,16 +1013,22 @@ out_add_root:
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
- spin_lock(&fs_info->qgroup_lock);
- fs_info->quota_root = quota_root;
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- spin_unlock(&fs_info->qgroup_lock);
ret = btrfs_commit_transaction(trans);
trans = NULL;
if (ret)
goto out_free_path;
+ /*
+ * Set quota enabled flag after committing the transaction, to avoid
+ * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
+ * creation.
+ */
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_root = quota_root;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ spin_unlock(&fs_info->qgroup_lock);
+
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
@@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
* The easy accounting, we're updating qgroup relationship whose child qgroup
* only has exclusive extents.
*
- * In this case, all exclsuive extents will also be exlusive for parent, so
+ * In this case, all exclusive extents will also be exclusive for parent, so
* excl/rfer just get added/removed.
*
* So is qgroup reservation space, which should also be added/removed to
@@ -1540,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ } else {
+ if (record->data_rsv && !entry->data_rsv) {
+ entry->data_rsv = record->data_rsv;
+ entry->data_rsv_refroot =
+ record->data_rsv_refroot;
+ }
return 1;
+ }
}
rb_link_node(&record->node, parent_node, p);
@@ -1591,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
- record = kmalloc(sizeof(*record), gfp_flag);
+ record = kzalloc(sizeof(*record), gfp_flag);
if (!record)
return -ENOMEM;
@@ -1749,14 +1761,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
*
* 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
* NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
- * They should be marked during preivous (@dst_level = 1) iteration.
+ * They should be marked during previous (@dst_level = 1) iteration.
*
* 3) Mark file extents in leaves dirty
 * We don't have a good way to pick out new file extents only.
 * So we still follow the old method by scanning all file extents in
 * the leaf.
*
- * This function can free us from keeping two pathes, thus later we only need
+ * This function can free us from keeping two paths, thus later we only need
* to care about how to iterate all new tree blocks in reloc tree.
*/
static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
@@ -1826,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
}
@@ -1895,7 +1907,7 @@ out:
*
 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
 * above tree blocks along with their counterparts in the file tree.
- * While during search, old tree blocsk OO(c) will be skiped as tree block swap
+ * While during search, old tree blocks OO(c) will be skipped as tree block swap
* won't affect OO(c).
*/
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
@@ -1967,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
need_cleanup = true;
}
@@ -2011,86 +2023,30 @@ out:
return ret;
}
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the conterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot)
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct extent_buffer *src_eb,
+ struct extent_buffer *dst_eb,
+ u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *dst_path = NULL;
- struct btrfs_key first_key;
- struct extent_buffer *src_eb = NULL;
- struct extent_buffer *dst_eb = NULL;
- bool trace_leaf = false;
- u64 child_gen;
- u64 child_bytenr;
int level;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
- /* Check parameter order */
- if (btrfs_node_ptr_generation(src_parent, src_slot) >
- btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ /* Wrong parameter order */
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
- btrfs_node_ptr_generation(src_parent, src_slot),
- btrfs_node_ptr_generation(dst_parent, dst_slot));
+ btrfs_header_generation(src_eb),
+ btrfs_header_generation(dst_eb));
return -EUCLEAN;
}
- /*
- * Only trace leaf if we're relocating data block groups, this could
- * reduce tons of data extents tracing for meta/sys bg relocation.
- */
- if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
- trace_leaf = true;
- /* Read out real @src_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
- child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
- btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
- src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(src_parent) - 1, &first_key);
- if (IS_ERR(src_eb)) {
- ret = PTR_ERR(src_eb);
- goto out;
- }
-
- /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
- child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
- btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
- dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(dst_parent) - 1, &first_key);
- if (IS_ERR(dst_eb)) {
- ret = PTR_ERR(dst_eb);
- goto out;
- }
-
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
- ret = -EINVAL;
+ ret = -EIO;
goto out;
}
@@ -2100,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = -ENOMEM;
goto out;
}
-
/* For dst_path */
extent_buffer_get(dst_eb);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
- /* Do the generation-aware breadth-first search */
+ /* Do the generation aware breadth-first search */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
level, last_snapshot, trace_leaf);
if (ret < 0)
@@ -2115,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = 0;
out:
- free_extent_buffer(src_eb);
- free_extent_buffer(dst_eb);
btrfs_free_path(dst_path);
if (ret < 0)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2201,7 +2154,7 @@ walk_down:
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2570,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
goto cleanup;
}
+ /* Free the reserved data space */
+ btrfs_qgroup_free_refroot(fs_info,
+ record->data_rsv_refroot,
+ record->data_rsv,
+ BTRFS_QGROUP_RSV_DATA);
/*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
@@ -2659,7 +2617,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
int i;
u64 *i_qgroups;
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
u32 level_size = 0;
@@ -2669,6 +2627,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
goto out;
+ quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
goto out;
@@ -2835,16 +2794,15 @@ out:
/*
* Two limits to commit transaction in advance.
*
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
* For SIZE, it will be in byte unit as threshold.
*/
-#define QGROUP_PERTRANS_RATIO 32
-#define QGROUP_PERTRANS_SIZE SZ_32M
+#define QGROUP_FREE_RATIO 32
+#define QGROUP_FREE_SIZE SZ_32M
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 limit;
+ u64 free;
u64 threshold;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2863,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
*/
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
- limit = qg->max_excl;
- else
- limit = qg->max_rfer;
- threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
- qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
- QGROUP_PERTRANS_RATIO;
- threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+ if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+ threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ } else {
+ free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+ threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ }
/*
 * Use transaction_kthread to commit the transaction, so we no
 * longer need to bother with nested transactions or lock context.
*/
- if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+ if (free < threshold)
btrfs_commit_transaction_locksafe(fs_info);
}
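As a worked example of the new check (a standalone sketch with made-up numbers, mirroring the constants above): with a 1 GiB exclusive limit, 900 MiB reserved and 100 MiB already exclusive, only 24 MiB is free while the threshold is min(1 GiB / 32, 32 MiB) = 32 MiB, so an early commit is requested.

#include <stdio.h>
#include <stdint.h>

#define QGROUP_FREE_RATIO 32
#define QGROUP_FREE_SIZE  (32ULL << 20)	/* SZ_32M */

int main(void)
{
	uint64_t max_excl  = 1ULL << 30;	/* limit: 1 GiB */
	uint64_t rsv_total = 900ULL << 20;	/* reserved: 900 MiB */
	uint64_t excl      = 100ULL << 20;	/* committed: 100 MiB */

	uint64_t free_bytes = max_excl - rsv_total - excl;	/* 24 MiB */
	uint64_t threshold  = max_excl / QGROUP_FREE_RATIO;	/* 32 MiB */

	if (threshold > QGROUP_FREE_SIZE)
		threshold = QGROUP_FREE_SIZE;
	printf("free=%llu threshold=%llu -> %s\n",
	       (unsigned long long)free_bytes,
	       (unsigned long long)threshold,
	       free_bytes < threshold ? "commit early" : "keep going");
	return 0;
}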
@@ -2952,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
@@ -3019,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3103,9 +3060,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->qgroup_rescan_lock);
goto out;
}
- extent_buffer_get(scratch_leaf);
- btrfs_tree_read_lock(scratch_leaf);
- btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3131,10 +3085,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
goto out;
}
out:
- if (scratch_leaf) {
- btrfs_tree_read_unlock_blocking(scratch_leaf);
+ if (scratch_leaf)
free_extent_buffer(scratch_leaf);
- }
if (done && !ret) {
ret = 1;
@@ -3781,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped block records of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add the subtree root swap record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree that gets swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicated but mismatched entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *subvol_eb)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct extent_buffer *reloc_eb = NULL;
+ struct rb_node *node;
+ bool found = false;
+ bool swapped = false;
+ int level = btrfs_header_level(subvol_eb);
+ int ret = 0;
+ int i;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ return 0;
+
+ spin_lock(&blocks->lock);
+ if (!blocks->swapped) {
+ spin_unlock(&blocks->lock);
+ return 0;
+ }
+ node = blocks->blocks[level].rb_node;
+
+ while (node) {
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+ if (block->subvol_bytenr < subvol_eb->start) {
+ node = node->rb_left;
+ } else if (block->subvol_bytenr > subvol_eb->start) {
+ node = node->rb_right;
+ } else {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spin_unlock(&blocks->lock);
+ goto out;
+ }
+ /* Found one, remove it from @blocks first and update blocks->swapped */
+ rb_erase(&block->node, &blocks->blocks[level]);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+ swapped = true;
+ break;
+ }
+ }
+ blocks->swapped = swapped;
+ spin_unlock(&blocks->lock);
+
+ /* Read out reloc subtree root */
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ block->reloc_generation, block->level,
+ &block->first_key);
+ if (IS_ERR(reloc_eb)) {
+ ret = PTR_ERR(reloc_eb);
+ reloc_eb = NULL;
+ goto free_out;
+ }
+ if (!extent_buffer_uptodate(reloc_eb)) {
+ ret = -EIO;
+ goto free_out;
+ }
+
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+ block->last_snapshot, block->trace_leaf);
+free_out:
+ kfree(block);
+ free_extent_buffer(reloc_eb);
+out:
+ if (ret < 0) {
+ btrfs_err_rl(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index d8f78f5ab854..46ba7bd2961c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -38,6 +40,66 @@
*/
/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap the subtrees of the subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However, since balance has ensured that both subtrees contain the
+ * same contents and have the same tree structures, such a swap won't
+ * change any qgroup numbers.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block that gets swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ *      reloc tree                  subvolume tree X
+ *         Root                             Root
+ *        /    \                           /    \
+ *      NA      OB                       OA      OB
+ *     /  \    /  \                     /  \    /  \
+ *   NC    ND OE    OF                OC    OD OE    OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ *      reloc tree                  subvolume tree X
+ *         Root                             Root
+ *        /    \                           /    \
+ *      OA      OB                       NA      OB
+ *     /  \    /  \                     /  \    /  \
+ *   OC    OD OE    OF                NC    ND OE    OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't match any record, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do subtree scan on both subtrees OA and NA.
+ * This results in 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any remaining record in X's swapped_blocks gets removed: there was no
+ * modification to those swapped subtrees, so there is no need to trigger
+ * a heavy qgroup subtree rescan for them.
+ */
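A toy standalone model of the four steps above (hypothetical names, a flat array instead of the per-level rbtrees; it illustrates the control flow only, not the kernel implementation):

#include <stdio.h>
#include <string.h>

struct swap_rec { unsigned long long subvol, reloc; int used; };
static struct swap_rec recs[16];

static void record_swap(unsigned long long subvol, unsigned long long reloc)
{
	for (int i = 0; i < 16; i++)
		if (!recs[i].used) {
			recs[i] = (struct swap_rec){ subvol, reloc, 1 };
			return;
		}
}

static void cow_hook(unsigned long long bytenr)	/* steps 3a/3b */
{
	for (int i = 0; i < 16; i++)
		if (recs[i].used && recs[i].subvol == bytenr) {
			printf("rescan subtrees at %llu and %llu\n",
			       recs[i].subvol, recs[i].reloc);
			recs[i].used = 0;	/* record consumed */
			return;
		}
	/* miss: ordinary COW, no qgroup work */
}

static void commit_hook(void)			/* step 4 */
{
	memset(recs, 0, sizeof(recs));
}

int main(void)
{
	record_swap(1024, 2048);	/* step 1: NA/OA swap recorded */
	cow_hook(4096);			/* 3a: OB, miss */
	cow_hook(1024);			/* 3b: NA, hit */
	commit_hook();
	return 0;
}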
+
+/*
 * Record a dirty extent, and inform qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
@@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
+
+ /*
+ * For qgroup reserved data space freeing.
+ *
+ * @data_rsv_refroot and @data_rsv will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * transaction commit time.
+ */
+ u32 data_rsv; /* reserved data space needs to be freed */
+ u64 data_rsv_refroot; /* which root the reserved data belongs to */
struct ulist *old_roots;
};
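Pieced together from the other hunks in this patch (the duplicate-merge in btrfs_qgroup_trace_extent_nolock() and the free in btrfs_qgroup_account_extents()), the intended lifecycle of these two fields is roughly the fragment below; reserved_bytes and root_objectid are placeholders, not real variables:

/* When the delayed ref is queued: remember what was reserved and where */
record->data_rsv = reserved_bytes;
record->data_rsv_refroot = root_objectid;

/* At transaction commit, once per surviving record */
btrfs_qgroup_free_refroot(fs_info, record->data_rsv_refroot,
			  record->data_rsv, BTRFS_QGROUP_RSV_DATA);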
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
/*
* Qgroup reservation types:
*
@@ -70,7 +161,7 @@ struct btrfs_qgroup_extent_record {
* be converted into META_PERTRANS.
*/
enum btrfs_qgroup_rsv_type {
- BTRFS_QGROUP_RSV_DATA = 0,
+ BTRFS_QGROUP_RSV_DATA,
BTRFS_QGROUP_RSV_META_PERTRANS,
BTRFS_QGROUP_RSV_META_PREALLOC,
BTRFS_QGROUP_RSV_LAST,
@@ -81,10 +172,10 @@ enum btrfs_qgroup_rsv_type {
*
* Each type should have different reservation behavior.
* E.g, data follows its io_tree flag modification, while
- * *currently* meta is just reserve-and-clear during transcation.
+ * *currently* meta is just reserve-and-clear during transaction.
*
* TODO: Add new type for reservation which can survive transaction commit.
- * Currect metadata reservation behavior is not suitable for such case.
+ * Current metadata reservation behavior is not suitable for such case.
*/
struct btrfs_qgroup_rsv {
u64 values[BTRFS_QGROUP_RSV_LAST];
@@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
-
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes)
-{
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return;
- trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
- BTRFS_QGROUP_RSV_DATA);
-}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *eb);
+
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index df41d7049936..1869ba8e5981 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
SetPageUptodate(bvec->bv_page);
}
@@ -1980,7 +1981,7 @@ cleanup_io:
* - In case of single failure, where rbio->failb == -1:
*
* Cache this rbio iff the above read reconstruction is
- * excuted without problems.
+ * executed without problems.
*/
if (err == BLK_STS_OK && rbio->failb < 0)
cache_rbio_pages(rbio);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index dec14b739b10..10d9589001a9 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
}
+ /* Insert extent in reada tree + all per-device trees, all or nothing */
+ down_read(&fs_info->dev_replace.rwsem);
ret = radix_tree_preload(GFP_KERNEL);
- if (ret)
+ if (ret) {
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
+ }
- /* insert extent in reada_tree + all per-device trees, all or nothing */
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
}
radix_tree_preload_end();
@@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
}
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
}
have_zone = 1;
}
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
if (!have_zone)
goto error;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d69fbfb30aa9..d09b6cdb785a 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -43,7 +43,7 @@ struct ref_entry {
* back to the delayed ref action. We hold the ref we are changing in the
* action so we can account for the history properly, and we record the root we
* were called with since it could be different from ref_root. We also store
- * stack traces because thats how I roll.
+ * stack traces because that's how I roll.
*/
struct ref_action {
int action;
@@ -56,7 +56,7 @@ struct ref_action {
/*
* One of these for every block we reference, it holds the roots and references
- * to it as well as all of the ref actions that have occured to it. We never
+ * to it as well as all of the ref actions that have occurred to it. We never
* free it until we unmount the file system in order to make sure re-allocations
* are happening properly.
*/
@@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
return -EIO;
}
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* This shouldn't happen because we will add our re
* above when we lookup the be with !parent, but just in
* case catch this case so we don't panic because I
- * didn't thik of some other corner case.
+ * didn't think of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
root->root_key.objectid, be->bytenr);
@@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 924116f654a1..ddf028509931 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -162,6 +162,8 @@ struct reloc_control {
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* list of subvolume trees that get relocated */
+ struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
@@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (!root->reloc_root)
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+ !root->reloc_root)
goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /* root->reloc_root will stay until the current relocation finishes */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
- root->reloc_root = NULL;
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
__del_reloc_root(reloc_root);
}
@@ -1773,7 +1777,7 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1786,7 +1790,7 @@ again:
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1802,6 +1806,8 @@ again:
BUG_ON(level < lowest_level);
ret = btrfs_bin_search(parent, &key, level, &slot);
+ if (ret < 0)
+ break;
if (ret && slot > 0)
slot--;
@@ -1852,7 +1858,7 @@ again:
slot, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1885,15 +1891,18 @@ again:
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
- * and tree block numbers, if current trans doesn't free
- * data reloc tree inode.
+ *
+ * We don't scan the subtree right now, but only record
+ * the swapped tree blocks.
+ * The real subtree rescan is delayed until we have new
+ * CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
- parent, slot, path->nodes[level],
- path->slots[level], last_snapshot);
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
if (ret < 0)
break;
-
/*
* swap blocks in fs tree and reloc tree.
*/
@@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level,
}
/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root = root->reloc_root;
+ struct btrfs_root_item *reloc_root_item;
+
+ /* @root must be a subvolume tree root with a valid reloc tree */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(reloc_root);
+
+ reloc_root_item = &reloc_root->root_item;
+ memset(&reloc_root_item->drop_progress, 0,
+ sizeof(reloc_root_item->drop_progress));
+ reloc_root_item->drop_level = 0;
+ btrfs_set_root_refs(reloc_root_item, 0);
+ btrfs_update_reloc_root(trans, root);
+
+ if (list_empty(&root->reloc_dirty_list)) {
+ btrfs_grab_fs_root(root);
+ list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+ }
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *next;
+ int ret = 0;
+
+ list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+ reloc_dirty_list) {
+ struct btrfs_root *reloc_root = root->reloc_root;
+
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ if (reloc_root) {
+ int ret2;
+
+ ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
+ btrfs_put_fs_root(root);
+ }
+ return ret;
+}
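The insert/clean pair above follows a common deferred-cleanup shape: queue the root while the transaction is still running, reap the queue once after commit. A standalone userspace sketch of that shape (hypothetical names, a linked list instead of the fs_root machinery):

#include <stdio.h>

struct subvol {
	const char *name;
	struct subvol *next;
};

static struct subvol *dirty_list;

static void insert_dirty(struct subvol *s)	/* queued during merge */
{
	s->next = dirty_list;
	dirty_list = s;
}

static int clean_dirty(void)			/* reaped after commit */
{
	int ret = 0;

	while (dirty_list) {
		struct subvol *s = dirty_list;

		dirty_list = s->next;
		printf("dropping reloc tree of %s\n", s->name);
		/* a real drop may fail; like clean_dirty_subvols(), keep
		 * the first error in ret and keep going */
	}
	return ret;
}

int main(void)
{
	struct subvol a = { "subvol-a", NULL };
	struct subvol b = { "subvol-b", NULL };

	insert_dirty(&a);
	insert_dirty(&b);
	return clean_dirty();
}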
+
+/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
*/
@@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
struct btrfs_trans_handle *trans = NULL;
@@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (err == 0) {
- memset(&root_item->drop_progress, 0,
- sizeof(root_item->drop_progress));
- root_item->drop_level = 0;
- btrfs_set_root_refs(root_item, 0);
- btrfs_update_reloc_root(trans, root);
- }
+ if (err == 0)
+ insert_dirty_subvol(trans, rc, root);
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -2410,14 +2465,6 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
-
- ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
- if (ret < 0) {
- if (list_empty(&reloc_root->root_list))
- list_add_tail(&reloc_root->root_list,
- &reloc_roots);
- goto out;
- }
}
if (found) {
@@ -2631,7 +2678,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* only one thread can access block_rsv at this point,
 * so we don't need to hold a lock to protect block_rsv.
 * we expand the reservation size here to allow enough
- * space for relocation and we will return eailer in
+ * space for relocation and we will return earlier in
* enospc case.
*/
rc->block_rsv->size = tmp + fs_info->nodesize *
@@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key,
upper->level, &slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
}
@@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -3959,6 +4014,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
restart:
if (update_backref_cache(trans, &rc->backref_cache)) {
btrfs_end_transaction(trans);
+ trans = NULL;
continue;
}
@@ -4078,6 +4134,9 @@ restart:
goto out_free;
}
btrfs_commit_transaction(trans);
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
@@ -4172,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
+ INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(&rc->processed_blocks, NULL);
@@ -4184,37 +4244,13 @@ static struct reloc_control *alloc_reloc_control(void)
static void describe_relocation(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
- char buf[128]; /* prefixed by a '|' that'll be dropped */
- u64 flags = block_group->flags;
-
- /* Shouldn't happen */
- if (!flags) {
- strcpy(buf, "|NONE");
- } else {
- char *bp = buf;
+ char buf[128] = {'\0'};
-#define DESCRIBE_FLAG(f, d) \
- if (flags & BTRFS_BLOCK_GROUP_##f) { \
- bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
- flags &= ~BTRFS_BLOCK_GROUP_##f; \
- }
- DESCRIBE_FLAG(DATA, "data");
- DESCRIBE_FLAG(SYSTEM, "system");
- DESCRIBE_FLAG(METADATA, "metadata");
- DESCRIBE_FLAG(RAID0, "raid0");
- DESCRIBE_FLAG(RAID1, "raid1");
- DESCRIBE_FLAG(DUP, "dup");
- DESCRIBE_FLAG(RAID10, "raid10");
- DESCRIBE_FLAG(RAID5, "raid5");
- DESCRIBE_FLAG(RAID6, "raid6");
- if (flags)
- snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
-#undef DESCRIBE_FLAG
- }
+ btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
btrfs_info(fs_info,
"relocating block group %llu flags %s",
- block_group->key.objectid, buf + 1);
+ block_group->key.objectid, buf);
}
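For reference, the removed open-coded builder boils down to the standalone sketch below (append "|name" per set flag, then print past the leading '|'); the new btrfs_describe_block_groups() helper centralizes the same idea:

#include <stdio.h>

#define F_DATA   (1u << 0)
#define F_SYSTEM (1u << 1)

static void describe(unsigned int flags, char *buf, size_t len)
{
	char *bp = buf;

	*bp = '\0';
	if (flags & F_DATA)
		bp += snprintf(bp, len - (bp - buf), "|%s", "data");
	if (flags & F_SYSTEM)
		bp += snprintf(bp, len - (bp - buf), "|%s", "system");
}

int main(void)
{
	char buf[64];

	describe(F_DATA | F_SYSTEM, buf, sizeof(buf));
	printf("%s\n", buf[0] ? buf + 1 : "NONE");	/* drop leading '|' */
	return 0;
}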
/*
@@ -4222,6 +4258,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
+ struct btrfs_block_group_cache *bg;
struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
@@ -4230,14 +4267,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
+ bg = btrfs_lookup_block_group(fs_info, group_start);
+ if (!bg)
+ return -ENOENT;
+
+ if (btrfs_pinned_by_swapfile(fs_info, bg)) {
+ btrfs_put_block_group(bg);
+ return -ETXTBSY;
+ }
+
rc = alloc_reloc_control();
- if (!rc)
+ if (!rc) {
+ btrfs_put_block_group(bg);
return -ENOMEM;
+ }
rc->extent_root = extent_root;
-
- rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!rc->block_group);
+ rc->block_group = bg;
ret = btrfs_inc_block_group_ro(rc->block_group);
if (ret) {
@@ -4481,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_free;
}
err = btrfs_commit_transaction(trans);
+
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda0682928..0d2b957ca3a3 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
uuid_le uuid;
- int len;
+ u32 len;
int need_reset = 0;
len = btrfs_item_size_nr(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
- min_t(int, len, (int)sizeof(*item)));
+ min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..a99588536c79 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
- /* Insert new lock */
+ /*
+ * Insert new lock.
+ */
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
@@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
scrub_free_ctx(sctx);
}
-static noinline_for_stack
-struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
+ struct btrfs_fs_info *fs_info, int is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
- struct btrfs_fs_info *fs_info = dev->fs_info;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
@@ -582,7 +583,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
- sctx->fs_info = dev->fs_info;
+ sctx->fs_info = fs_info;
+ INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@@ -607,7 +609,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
- INIT_LIST_HEAD(&sctx->csum_list);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int page_num;
int success;
bool full_stripe_locked;
+ unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
dev = sblock_to_check->pagev[0]->dev;
/*
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting for the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ * We do allocations in the workers through insert_full_stripe_lock()
+ * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+ * this function.
+ */
+ nofs_flag = memalloc_nofs_save();
+ /*
* For RAID5/6, race can happen for a different device scrub thread.
* For data corruption, Parity and Data threads will both try
 * to recover the data.
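The comment above describes the standard scoped-NOFS pattern: rather than threading GFP_NOFS through every callee, the task marks itself, and GFP_KERNEL allocations inside the scope are implicitly degraded to GFP_NOFS. A kernel-style sketch of the shape (an illustrative helper, not a literal excerpt from this patch):

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *repair_alloc_scoped(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();
	/* within the scope, GFP_KERNEL cannot recurse into the fs */
	p = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	return p;
}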
@@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
if (ret < 0) {
+ memalloc_nofs_restore(nofs_flag);
spin_lock(&sctx->stat_lock);
if (ret == -ENOMEM)
sctx->stat.malloc_errors++;
@@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_NOFS);
+ sizeof(*sblocks_for_recheck), GFP_KERNEL);
if (!sblocks_for_recheck) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -1202,6 +1215,7 @@ out:
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0)
return ret;
return 0;
@@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
- * that started dellaloc right before we set the block
+ * that started delalloc right before we set the block
* group to RO mode, as they might have just allocated
* an extent from it or decided they could do a nocow
* write. And if any such tasks did that, wait for their
@@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+ down_write(&fs_info->dev_replace.rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+ up_write(&dev_replace->rwsem);
+
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache);
@@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+ down_write(&fs_info->dev_replace.rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+ up_write(&fs_info->dev_replace.rwsem);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -3726,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
- if (fs_info->scrub_workers_refcnt == 0) {
+ lockdep_assert_held(&fs_info->scrub_lock);
+
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL);
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
+ ASSERT(fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
+ ASSERT(fs_info->scrub_parity_workers == NULL);
fs_info->scrub_parity_workers =
btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
+
+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
+ } else {
+ refcount_inc(&fs_info->scrub_workers_refcnt);
}
- ++fs_info->scrub_workers_refcnt;
return 0;
fail_scrub_parity_workers:
@@ -3755,16 +3778,6 @@ fail_scrub_workers:
return -ENOMEM;
}
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
- if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_destroy_workqueue(fs_info->scrub_workers);
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
- btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
- }
- WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
@@ -3772,6 +3785,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3813,13 +3830,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EINVAL;
}
+ /* Allocate outside of device_list_mutex */
+ sctx = scrub_setup_ctx(fs_info, is_dev_replace);
+ if (IS_ERR(sctx))
+ return PTR_ERR(sctx);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_free_ctx;
}
if (!is_dev_replace && !readonly &&
@@ -3827,7 +3849,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
- return -EROFS;
+ ret = -EROFS;
+ goto out_free_ctx;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3835,34 +3858,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -EIO;
+ ret = -EIO;
+ goto out_free_ctx;
}
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ down_read(&fs_info->dev_replace.rwsem);
if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -EINPROGRESS;
+ ret = -EINPROGRESS;
+ goto out_free_ctx;
}
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return ret;
+ goto out_free_ctx;
}
- sctx = scrub_setup_ctx(dev, is_dev_replace);
- if (IS_ERR(sctx)) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(fs_info);
- return PTR_ERR(sctx);
- }
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3875,7 +3893,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
* kick off writing super in log tree sync.
@@ -3887,6 +3916,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
@@ -3897,14 +3927,34 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
+ if (!is_dev_replace)
+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+ ret ? "not finished" : "finished", devid, ret);
+
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- scrub_workers_put(fs_info);
+ if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ }
mutex_unlock(&fs_info->scrub_lock);
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
scrub_put_ctx(sctx);
return ret;
+
+out_free_ctx:
+ scrub_free_ctx(sctx);
+
+ return ret;
}
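The teardown above is the classic detach-under-lock, destroy-unlocked shape: the last reference NULLs the pointers while scrub_lock is held and destroys the workqueues afterwards, so destruction can never deadlock against the lock. A userspace analogue (hypothetical names, pthreads in place of the mutex and refcount_t):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcnt;
static void *workers;

static void get_workers(void)
{
	pthread_mutex_lock(&lock);
	if (refcnt++ == 0)
		workers = malloc(64);	/* stand-in for the workqueues */
	pthread_mutex_unlock(&lock);
}

static void put_workers(void)
{
	void *to_free = NULL;

	pthread_mutex_lock(&lock);
	if (--refcnt == 0) {
		to_free = workers;	/* detach under the lock */
		workers = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(to_free);			/* heavy teardown, unlocked */
}

int main(void)
{
	get_workers();
	put_workers();
	return 0;
}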
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
@@ -3979,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 094cc1444a90..7ea2d6b1f170 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2238,7 +2238,7 @@ out:
* inodes "orphan" name instead of the real name and stop. Same with new inodes
* that were not created yet and overwritten inodes/refs.
*
- * When do we have have orphan inodes:
+ * When do we have orphan inodes:
* 1. When an inode is freshly created and thus no valid refs are available yet
 * 2. When a directory lost all its refs (deleted) but still has dir items
* inside which were not processed yet (pending for move/delete). If anyone
@@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
kfree(m);
}
-static void tail_append_pending_moves(struct pending_dir_move *moves,
+static void tail_append_pending_moves(struct send_ctx *sctx,
+ struct pending_dir_move *moves,
struct list_head *stack)
{
if (list_empty(&moves->list)) {
@@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves,
list_add_tail(&moves->list, stack);
list_splice_tail(&list, stack);
}
+ if (!RB_EMPTY_NODE(&moves->node)) {
+ rb_erase(&moves->node, &sctx->pending_dir_moves);
+ RB_CLEAR_NODE(&moves->node);
+ }
}
static int apply_children_dir_moves(struct send_ctx *sctx)
@@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
return 0;
INIT_LIST_HEAD(&stack);
- tail_append_pending_moves(pm, &stack);
+ tail_append_pending_moves(sctx, pm, &stack);
while (!list_empty(&stack)) {
pm = list_first_entry(&stack, struct pending_dir_move, list);
@@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
goto out;
pm = get_pending_dir_moves(sctx, parent_ino);
if (pm)
- tail_append_pending_moves(pm, &stack);
+ tail_append_pending_moves(sctx, pm, &stack);
}
return 0;
@@ -3849,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
/*
* We may have refs where the parent directory does not exist
 * yet. This happens if the parent directory's inum is higher
- * the the current inum. To handle this case, we create the
+ * than the current inum. To handle this case, we create the
* parent directory out of order. But we need to check if this
* did already happen before due to other refs in the same dir.
*/
@@ -4770,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_key key;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
- unsigned pg_offset = offset & ~PAGE_MASK;
+ unsigned pg_offset = offset_in_page(offset);
ssize_t ret = 0;
key.objectid = sctx->cur_ino;
@@ -6641,7 +6646,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- if (!access_ok(VERIFY_READ, arg->clone_sources,
+ if (!access_ok(arg->clone_sources,
sizeof(*arg->clone_sources) *
arg->clone_sources_count)) {
ret = -EFAULT;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b362b45dd757..120e4340792a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno)
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the approciate error response.
+ * invokes the appropriate error response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
@@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* although there is no way to update the progress. It would add the
* risk of a deadlock, therefore the canceling is omitted. The only
* penalty is that some I/O remains active until the procedure
- * completes. The next time when the filesystem is mounted writeable
+ * completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
}
@@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
- btrfs_compress_str2level(args[0].from);
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZLIB,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
- } else if (strcmp(args[0].from, "zstd") == 0) {
+ } else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZSTD,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -1458,56 +1464,6 @@ out:
return root;
}
-static int parse_security_options(char *orig_opts,
- struct security_mnt_opts *sec_opts)
-{
- char *secdata = NULL;
- int ret = 0;
-
- secdata = alloc_secdata();
- if (!secdata)
- return -ENOMEM;
- ret = security_sb_copy_data(orig_opts, secdata);
- if (ret) {
- free_secdata(secdata);
- return ret;
- }
- ret = security_sb_parse_opts_str(secdata, sec_opts);
- free_secdata(secdata);
- return ret;
-}
-
-static int setup_security_options(struct btrfs_fs_info *fs_info,
- struct super_block *sb,
- struct security_mnt_opts *sec_opts)
-{
- int ret = 0;
-
- /*
- * Call security_sb_set_mnt_opts() to check whether new sec_opts
- * is valid.
- */
- ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
- if (ret)
- return ret;
-
-#ifdef CONFIG_SECURITY
- if (!fs_info->security_opts.num_mnt_opts) {
- /* first time security setup, copy sec_opts to fs_info */
- memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
- } else {
- /*
- * Since SELinux (the only one supporting security_mnt_opts)
- * does NOT support changing context during remount/mount of
- * the same sb, this must be the same or part of the same
- * security options, just free it.
- */
- security_free_mnt_opts(sec_opts);
- }
-#endif
- return ret;
-}
-
/*
* Find a superblock for the given device / mount point.
*
@@ -1522,16 +1478,15 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
struct btrfs_device *device = NULL;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
- struct security_mnt_opts new_sec_opts;
+ void *new_sec_opts = NULL;
fmode_t mode = FMODE_READ;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
- security_init_mnt_opts(&new_sec_opts);
if (data) {
- error = parse_security_options(data, &new_sec_opts);
+ error = security_sb_eat_lsm_opts(data, &new_sec_opts);
if (error)
return ERR_PTR(error);
}
@@ -1550,7 +1505,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- security_init_mnt_opts(&fs_info->security_opts);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
@@ -1601,16 +1555,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data);
}
+ if (!error)
+ error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
+ security_free_mnt_opts(&new_sec_opts);
if (error) {
deactivate_locked_super(s);
- goto error_sec_opts;
- }
-
- fs_info = btrfs_sb(s);
- error = setup_security_options(fs_info, s, &new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- goto error_sec_opts;
+ return ERR_PTR(error);
}
return dget(s->s_root);
@@ -1677,6 +1627,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
flags | SB_RDONLY, device_name, data);
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
+ kfree(subvol_name);
goto out;
}
@@ -1686,12 +1637,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (error < 0) {
root = ERR_PTR(error);
mntput(mnt_root);
+ kfree(subvol_name);
goto out;
}
}
}
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
+ kfree(subvol_name);
goto out;
}
@@ -1779,18 +1732,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_remount_prepare(fs_info);
if (data) {
- struct security_mnt_opts new_sec_opts;
+ void *new_sec_opts = NULL;
- security_init_mnt_opts(&new_sec_opts);
- ret = parse_security_options(data, &new_sec_opts);
+ ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
+ if (!ret)
+ ret = security_sb_remount(sb, new_sec_opts);
+ security_free_mnt_opts(&new_sec_opts);
if (ret)
goto restore;
- ret = setup_security_options(fs_info, sb,
- &new_sec_opts);
- if (ret) {
- security_free_mnt_opts(&new_sec_opts);
- goto restore;
- }
}
ret = btrfs_parse_options(fs_info, data, *flags);
@@ -1848,7 +1797,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
- "too many missing devices, writeable remount is not allowed");
+ "too many missing devices, writable remount is not allowed");
ret = -EACCES;
goto restore;
}
@@ -1916,7 +1865,7 @@ restore:
}
/* Used to sort the devices by max_avail(descending sort) */
-static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
const void *dev_info2)
{
if (((struct btrfs_device_info *)dev_info1)->max_avail >
@@ -1945,8 +1894,8 @@ static inline void btrfs_descending_sort_devices(
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
-static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
- u64 *free_bytes)
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+ u64 *free_bytes)
{
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2090,7 +2039,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_free_data = 0;
u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
- __be32 *fsid = (__be32 *)fs_info->fsid;
+ __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
@@ -2237,6 +2186,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
+ vol->name[BTRFS_PATH_NAME_MAX] = '\0';
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
@@ -2246,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
+ case BTRFS_IOC_FORGET_DEV:
+ ret = btrfs_forget_devices(vol->name);
+ break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
@@ -2311,7 +2264,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* device_list_mutex here as we only read the device data and the list
* is protected by RCU. Even if a device is deleted during the list
* traversals, we'll get valid data, the freeing callback will wait at
- * least until until the rcu_read_unlock.
+ * least until the rcu_read_unlock.
*/
rcu_read_lock();
cur_devices = fs_info->fs_devices;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3717c864ba23..5a5930e3d32b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
static struct attribute *btrfs_supported_feature_attrs[] = {
@@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
BTRFS_FEAT_ATTR_PTR(no_holes),
+ BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
NULL
};
@@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj,
BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store);
+static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%pU\n",
+ fs_info->fs_devices->metadata_uuid);
+}
+
+BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
BTRFS_ATTR_PTR(, sectorsize),
BTRFS_ATTR_PTR(, clone_alignment),
BTRFS_ATTR_PTR(, quota_override),
+ BTRFS_ATTR_PTR(, metadata_uuid),
NULL,
};
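Once wired into btrfs_attrs, the value is exposed per filesystem under the existing /sys/fs/btrfs/<fsid>/ directory; a hypothetical userspace read (illustration only, <fsid> stands for the filesystem UUID):
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64] = { 0 };
	int fd = open("/sys/fs/btrfs/<fsid>/metadata_uuid", O_RDONLY);

	if (fd >= 0 && read(fd, buf, sizeof(buf) - 1) > 0)
		printf("%s", buf);	/* the "%pU\n" formatted UUID */
	if (fd >= 0)
		close(fd);
	return 0;
}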
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index c6ee600aff89..40716b357c1d 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -9,7 +9,7 @@
extern u64 btrfs_debugfs_test;
enum btrfs_feature_set {
- FEAT_COMPAT = 0,
+ FEAT_COMPAT,
FEAT_COMPAT_RO,
FEAT_INCOMPAT,
FEAT_MAX
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index db72b3b6209e..8a59597f1883 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
- if (root->node)
+ if (root->node) {
+ /* One for allocate_extent_buffer */
free_extent_buffer(root->node);
+ }
kfree(root);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e0f4a01be14..3c46d7f23456 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize)
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
- u64 total_dirty = SZ_256M;
- u64 max_bytes = SZ_128M;
+ /* In this test we need at least 2 file extents, each at its maximum size */
+ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+ u64 total_dirty = 2 * max_bytes;
u64 start, end, test_start;
- u64 found;
+ bool found;
int ret = -EINVAL;
test_msg("running find delalloc tests");
@@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize)
return -ENOMEM;
}
- extent_io_tree_init(&tmp, inode);
+ extent_io_tree_init(&tmp, NULL);
/*
* First go through and create and mark all of our pages dirty, we pin
@@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("should have found at least one delalloc");
goto out_bits;
@@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("couldn't find delalloc in our range");
goto out_bits;
@@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
@@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("didn't find our range");
goto out_bits;
@@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* test's expected behavior.
*/
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("didn't find our range");
goto out_bits;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 64043f028820..af0c8e30d9e2 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
- /*
- * We will just free a dummy node if it's ref count is 2 so we need an
- * extra ref so our searches don't accidentally release our page.
- */
- extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
ret = -EINVAL;
@@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
- extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
BTRFS_I(inode)->root = root;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5686290a50e1..acdad6d658f5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
@@ -233,14 +234,12 @@ loop:
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
- init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
refcount_set(&cur_trans->use_count, 2);
- atomic_set(&cur_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = ktime_get_seconds();
@@ -456,7 +455,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool enforce_qgroups)
{
struct btrfs_fs_info *fs_info = root->fs_info;
-
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
@@ -485,13 +484,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
+ struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
+ u64 delayed_refs_bytes = 0;
+
qgroup_reserved = num_items * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
enforce_qgroups);
if (ret)
return ERR_PTR(ret);
+ /*
+ * We want to reserve all the bytes we may need all at once, so
+ * we only do 1 enospc flushing cycle per transaction start. We
+ * accomplish this by simply assuming we'll do 2 x num_items
+ * worth of delayed refs updates in this trans handle, and
+ * refill that amount for whatever is missing in the reserve.
+ */
num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+ if (delayed_refs_rsv->full == 0) {
+ delayed_refs_bytes = num_bytes;
+ num_bytes <<= 1;
+ }
+
/*
* Do the reservation for the relocation root creation
*/
@@ -500,8 +514,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
- num_bytes, flush);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+ if (ret)
+ goto reserve_fail;
+ if (delayed_refs_bytes) {
+ btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
+ delayed_refs_bytes);
+ num_bytes -= delayed_refs_bytes;
+ }
+ } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !delayed_refs_rsv->full) {
+ /*
+ * Some people call with btrfs_start_transaction(root, 0)
+ * because they can be throttled, but have some other mechanism
+ * for reserving space. We still want these callers to refill the
+ * delayed refs rsv, so just add one item's worth of reservation
+ * here.
+ */
+ ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
if (ret)
goto reserve_fail;
}
@@ -670,7 +700,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
/*
* btrfs_attach_transaction_barrier() - catch the running transaction
*
- * It is similar to the above function, the differentia is this one
+ * It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
* complete.
*/
@@ -760,7 +790,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (btrfs_check_space_for_delayed_refs(trans))
+ if (btrfs_check_space_for_delayed_refs(fs_info))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -769,22 +799,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- int updates;
- int err;
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED ||
cur_trans->delayed_refs.flushing)
return 1;
- updates = trans->delayed_ref_updates;
- trans->delayed_ref_updates = 0;
- if (updates) {
- err = btrfs_run_delayed_refs(trans, updates * 2);
- if (err) /* Error code will also eval true */
- return err;
- }
-
return should_end_transaction(trans);
}
@@ -814,11 +834,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- u64 transid = trans->transid;
- unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
- int must_run_delayed_refs = 0;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
@@ -829,40 +846,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
-
- trans->delayed_ref_updates = 0;
- if (!trans->sync) {
- must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans);
- cur = max_t(unsigned long, cur, 32);
-
- /*
- * don't make the caller wait if they are from a NOLOCK
- * or ATTACH transaction, it will deadlock with commit
- */
- if (must_run_delayed_refs == 1 &&
- (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
- must_run_delayed_refs = 2;
- }
-
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
- if (lock && should_end_transaction(trans) &&
- READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
- spin_lock(&info->trans_lock);
- if (cur_trans->state == TRANS_STATE_RUNNING)
- cur_trans->state = TRANS_STATE_BLOCKED;
- spin_unlock(&info->trans_lock);
- }
-
if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle)
return btrfs_commit_transaction(trans);
@@ -894,10 +881,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- if (must_run_delayed_refs) {
- btrfs_async_run_delayed_refs(info, cur, transid,
- must_run_delayed_refs == 1);
- }
return err;
}
@@ -1338,7 +1321,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return 0;
/*
- * Ensure dirty @src will be commited. Or, after comming
+ * Ensure dirty @src will be committed. Or, after coming
* commit_fs_roots() and switch_commit_roots(), any dirty but not
* recorded root will never be updated again, causing an outdated root
* item.
@@ -1549,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking(old);
+ btrfs_set_lock_blocking_write(old);
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
@@ -1842,7 +1825,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- DEFINE_WAIT(wait);
WARN_ON(refcount_read(&trans->use_count) > 1);
@@ -1889,6 +1871,21 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
+/*
+ * Release reserved delayed ref space of all pending block groups of the
+ * transaction and remove them from the list
+ */
+static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ list_del_init(&block_group->bg_list);
+ }
+}
+
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
/*
@@ -1911,13 +1908,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
-static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
-{
- wait_event(cur_trans->pending_wait,
- atomic_read(&cur_trans->pending_ordered) == 0);
-}
-
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1953,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->delayed_refs.flushing = 1;
smp_wmb();
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
@@ -2052,8 +2041,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(fs_info);
- btrfs_wait_pending_ordered(cur_trans);
-
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
@@ -2283,21 +2270,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- /*
- * If fs has been frozen, we can not handle delayed iputs, otherwise
- * it'll result in deadlock about SB_FREEZE_FS.
- */
- if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread &&
- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
- btrfs_run_delayed_iputs(fs_info);
-
return ret;
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
+ btrfs_cleanup_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4cbb1b55387d..f1ba78949d1b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -12,13 +12,13 @@
#include "ctree.h"
enum btrfs_trans_state {
- TRANS_STATE_RUNNING = 0,
- TRANS_STATE_BLOCKED = 1,
- TRANS_STATE_COMMIT_START = 2,
- TRANS_STATE_COMMIT_DOING = 3,
- TRANS_STATE_UNBLOCKED = 4,
- TRANS_STATE_COMPLETED = 5,
- TRANS_STATE_MAX = 6,
+ TRANS_STATE_RUNNING,
+ TRANS_STATE_BLOCKED,
+ TRANS_STATE_COMMIT_START,
+ TRANS_STATE_COMMIT_DOING,
+ TRANS_STATE_UNBLOCKED,
+ TRANS_STATE_COMPLETED,
+ TRANS_STATE_MAX,
};
#define BTRFS_TRANS_HAVE_FREE_BGS 0
@@ -39,7 +39,6 @@ struct btrfs_transaction {
*/
atomic_t num_writers;
refcount_t use_count;
- atomic_t pending_ordered;
unsigned long flags;
@@ -51,7 +50,6 @@ struct btrfs_transaction {
time64_t start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
- wait_queue_head_t pending_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
struct list_head switch_commits;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index cab0b1f1f741..a62e1e837a89 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -27,10 +27,10 @@
*
* @type: leaf or node
* @identifier: the necessary info to locate the leaf/node.
- * It's recommened to decode key.objecitd/offset if it's
+ * It's recommended to decode key.objectid/offset if it's
* meaningful.
* @reason: describe the error
- * @bad_value: optional, it's recommened to output bad value and its
+ * @bad_value: optional, it's recommended to output bad value and its
* expected value (range).
*
* Since comma is used to separate the components, only space is allowed
@@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,
}
/*
- * Support for new compression/encrption must introduce incompat flag,
+ * Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
@@ -389,13 +389,11 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
/*
* Here we don't really care about alignment since extent allocator can
- * handle it. We care more about the size, as if one block group is
- * larger than maximum size, it's must be some obvious corruption.
+ * handle it. We care more about the size.
*/
- if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) {
+ if (key->offset == 0) {
block_group_err(fs_info, leaf, slot,
- "invalid block group size, have %llu expect (0, %llu]",
- key->offset, BTRFS_MAX_DATA_CHUNK_SIZE);
+ "invalid block group size 0");
return -EUCLEAN;
}
@@ -440,7 +438,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
type != (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA)) {
block_group_err(fs_info, leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3c0987ab587d..5f9e2dd413af 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(root_node);
+ btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0dba09334a16..f06454a55e00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2
+#define LOG_OTHER_INODE_ALL 3
/*
* directory trouble cases
@@ -1144,7 +1145,7 @@ next:
}
btrfs_release_path(path);
- /* look for a conflicing name */
+ /* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
if (di && !IS_ERR(di)) {
@@ -1330,6 +1331,67 @@ out:
return ret;
}
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct inode *dir, struct inode *inode, const char *name,
+ int namelen, u64 ref_index)
+{
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct inode *other_inode = NULL;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_item = btrfs_lookup_dir_item(NULL, root, path,
+ btrfs_ino(BTRFS_I(dir)),
+ name, namelen, 0);
+ if (!dir_item) {
+ btrfs_release_path(path);
+ goto add_link;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ goto out;
+ }
+
+ /*
+ * Our inode's dentry collides with the dentry of another inode which is
+ * in the log but not yet processed since it has a higher inode number.
+ * So delete that other dentry.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+ btrfs_release_path(path);
+ other_inode = read_one_inode(root, key.objectid);
+ if (!other_inode) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
+ if (ret)
+ goto out;
+ /*
+ * If we dropped the link count to 0, bump it so that later the iput()
+ * on the inode will not free it. We will fixup the link count later.
+ */
+ if (other_inode->i_nlink == 0)
+ inc_nlink(other_inode);
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto out;
+add_link:
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
+out:
+ iput(other_inode);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen, 0, ref_index);
+ ret = add_link(trans, root, dir, inode, name, namelen,
+ ref_index);
if (ret)
goto out;
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -3149,7 +3210,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
/*
- * nobody else is going to jump in and write the the ctree
+ * Nobody else is going to jump in and write the ctree
* super here because the log_commit atomic below is protecting
* us. We must be called with a transaction handle pinning
* the running transaction open, so a full commit can't hop
@@ -3201,8 +3262,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log)
{
int ret;
- u64 start;
- u64 end;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
@@ -3216,18 +3275,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
btrfs_handle_fs_error(log->fs_info, ret, NULL);
}
- while (1) {
- ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
- NULL);
- if (ret)
- break;
-
- clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- }
-
+ clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
free_extent_buffer(log->node);
kfree(log);
}
@@ -3718,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
&start_slot);
+ if (ret < 0)
+ break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@@ -4383,20 +4434,33 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
- logged_start = start;
- logged_end = end;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ /*
+ * Skip extents outside our logging range. It's important to do
+ * it for correctness because if we don't ignore them, we may
+ * log them before their ordered extent completes, and therefore
+ * we could log them without logging their respective checksums
+ * (the checksum items are added to the csum tree at the very
+ * end of btrfs_finish_ordered_io()). Also leave such extents
+ * outside of our range in the list, since we may have another
+ * ranged fsync in the near future that needs them. If an extent
+ * outside our range corresponds to a hole, log it to avoid
+ * leaving gaps between extents (fsck will complain when we are
+ * not using the NO_HOLES feature).
+ */
+ if ((em->start > end || em->start + em->len <= start) &&
+ em->block_start != EXTENT_MAP_HOLE)
+ continue;
+
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4418,11 +4482,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
em->start >= i_size_read(&inode->vfs_inode))
continue;
- if (em->start < logged_start)
- logged_start = em->start;
- if ((em->start + em->len - 1) > logged_end)
- logged_end = em->start + em->len - 1;
-
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -4456,7 +4515,6 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4652,7 +4710,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
@@ -4721,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
- u64 *other_ino)
+ u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
@@ -4784,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
- ret = 1;
- *other_ino = di_key.objectid;
+ if (di_key.objectid != key->objectid) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ } else {
+ ret = 0;
+ }
} else {
ret = -EAGAIN;
}
@@ -4805,6 +4869,144 @@ out:
return ret;
}
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ u64 ino, u64 parent)
+{
+ struct btrfs_ino_list *ino_elem;
+ LIST_HEAD(inode_list);
+ int ret = 0;
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+
+ while (!list_empty(&inode_list)) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+ list);
+ ino = ino_elem->ino;
+ parent = ino_elem->parent;
+ list_del(&ino_elem->list);
+ kfree(ino_elem);
+ if (ret)
+ continue;
+
+ btrfs_release_path(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory.
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT) {
+ key.objectid = parent;
+ inode = btrfs_iget(fs_info->sb, &key, root,
+ NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ } else {
+ ret = btrfs_log_inode(trans, root,
+ BTRFS_I(inode),
+ LOG_OTHER_INODE_ALL,
+ 0, LLONG_MAX, ctx);
+ iput(inode);
+ }
+ }
+ continue;
+ }
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ if (ret) {
+ iput(inode);
+ continue;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ iput(inode);
+ continue;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_check_ref_name_override(leaf, slot, &key,
+ BTRFS_I(inode), &other_ino,
+ &other_parent);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ break;
+ }
+ ino_elem->ino = other_ino;
+ ino_elem->parent = other_parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+ ret = 0;
+ }
+ path->slots[0]++;
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4844,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
+ bool recursive_logging = false;
path = btrfs_alloc_path();
if (!path)
@@ -4889,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return ret;
}
- if (inode_only == LOG_OTHER_INODE) {
- inode_only = LOG_INODE_EXISTS;
+ if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+ recursive_logging = true;
+ if (inode_only == LOG_OTHER_INODE)
+ inode_only = LOG_INODE_EXISTS;
+ else
+ inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
@@ -4985,20 +5192,19 @@ again:
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid) {
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
+ u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
- &other_ino);
+ &other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- struct btrfs_key inode_key;
- struct inode *other_inode;
-
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5014,43 +5220,13 @@ again:
goto out_unlock;
}
ins_nr = 0;
- btrfs_release_path(path);
- inode_key.objectid = other_ino;
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
- other_inode = btrfs_iget(fs_info->sb,
- &inode_key, root,
- NULL);
- /*
- * If the other inode that had a conflicting dir
- * entry was deleted in the current transaction,
- * we don't need to do more work nor fallback to
- * a transaction commit.
- */
- if (other_inode == ERR_PTR(-ENOENT)) {
- goto next_key;
- } else if (IS_ERR(other_inode)) {
- err = PTR_ERR(other_inode);
- goto out_unlock;
- }
- /*
- * We are safe logging the other inode without
- * acquiring its i_mutex as long as we log with
- * the LOG_INODE_EXISTS mode. We're safe against
- * concurrent renames of the other inode as well
- * because during a rename we pin the log and
- * update the log with the new name before we
- * unpin it.
- */
- err = btrfs_log_inode(trans, root,
- BTRFS_I(other_inode),
- LOG_OTHER_INODE, 0, LLONG_MAX,
- ctx);
- iput(other_inode);
+
+ err = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
if (err)
goto out_unlock;
- else
- goto next_key;
+ btrfs_release_path(path);
+ goto next_key;
}
}
@@ -5762,6 +5938,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
+ /*
+ * If a new hard link was added to the inode in the current transaction
+ * and its link count is now greater than 1, we need to fall back to a
+ * transaction commit, otherwise we can end up not logging all its new
+ * parents for all the hard links. Here, just from the dentry used for
+ * the fsync, we can not visit the ancestor inodes of all the other
+ * hard links to figure out if any is new, so we fall back to a transaction
+ * commit (instead of adding a lot of complexity of scanning a btree,
+ * since this scenario is not a common use case).
+ */
+ if (inode->vfs_inode.i_nlink > 1 &&
+ inode->last_link_trans > last_committed) {
+ ret = -EMLINK;
+ goto end_trans;
+ }
+
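A concrete (hypothetical) trigger for this fallback, sketched:
/* fd = open("A/f1", O_CREAT | O_WRONLY, 0644);   inode created now
 * link("A/f1", "B/f2");                          i_nlink becomes 2
 * fsync(fd);     last_link_trans > last_committed, so we return
 *                -EMLINK here and the caller falls back to a full
 *                transaction commit, persisting both directory entries.
 */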
while (1) {
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 767765031e59..0fab84a8f670 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -15,7 +15,6 @@
struct btrfs_log_ctx {
int log_ret;
int log_transid;
- int io_err;
bool log_new_dentries;
struct inode *inode;
struct list_head list;
@@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
{
ctx->log_ret = 0;
ctx->log_transid = 0;
- ctx->io_err = 0;
ctx->log_new_dentries = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..9024eee889b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .nparity = 0,
.raid_name = "raid10",
.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .nparity = 0,
.raid_name = "raid1",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
@@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 2,
+ .nparity = 0,
.raid_name = "dup",
.bg_flag = BTRFS_BLOCK_GROUP_DUP,
.mindev_error = 0,
@@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .nparity = 0,
.raid_name = "raid0",
.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
.mindev_error = 0,
@@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .nparity = 0,
.raid_name = "single",
.bg_flag = 0,
.mindev_error = 0,
@@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 1,
- .ncopies = 2,
+ .ncopies = 1,
+ .nparity = 1,
.raid_name = "raid5",
.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.devs_min = 3,
.tolerated_failures = 2,
.devs_increment = 1,
- .ncopies = 3,
+ .ncopies = 1,
+ .nparity = 2,
.raid_name = "raid6",
.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
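Separating parity stripes from ncopies makes the space math come out right for the striped-parity profiles; an illustrative calculation (not from this diff):
/* For a full stripe written across num_stripes devices:
 *   data stripes = num_stripes - nparity
 * e.g. raid6 on 6 disks stores 4 data stripes per 6 stripes written,
 * while ncopies == 1 now correctly states that each data byte is
 * stored once: it is reconstructable from parity, not duplicated as
 * the old ncopies == 3 implied.
 */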
@@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type)
return btrfs_raid_array[type].raid_name;
}
+/*
+ * Fill @buf with a textual description of @bg_flags, writing no more than
+ * @size_buf bytes including the terminating null byte.
+ */
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+ int i;
+ int ret;
+ char *bp = buf;
+ u64 flags = bg_flags;
+ u32 size_bp = size_buf;
+
+ if (!flags) {
+ strcpy(bp, "NONE");
+ return;
+ }
+
+#define DESCRIBE_FLAG(flag, desc) \
+ do { \
+ if (flags & (flag)) { \
+ ret = snprintf(bp, size_bp, "%s|", (desc)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ flags &= ~(flag); \
+ } \
+ } while (0)
+
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+
+ DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+ btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+ if (flags) {
+ ret = snprintf(bp, size_bp, "0x%llx|", flags);
+ size_bp -= ret;
+ }
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+ /*
+ * The text is trimmed if it does not fit; it's up to the caller to
+ * provide a sufficiently large buffer.
+ */
+out_overflow:;
+}
+
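A hedged usage sketch of the new helper (flag names from this file; the expected output follows from the DESCRIBE_FLAG() ordering above):
/* Illustration only: */
char buf[64];

btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
			    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
/* buf == "data|raid1"; the trailing '|' is replaced by '\0' above */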
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
@@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* the mutex can be very coarse and can cover long-running operations
*
* protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
+ * seeding, structure cloning, opening/closing devices at mount/umount time
*
* global::fs_devs - add, remove, updates to the global list
*
@@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void)
/*
* alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the uuid to fs_devices::fsid
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+ const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devs;
@@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+ if (metadata_fsid)
+ memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+ else if (fsid)
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+
return fs_devs;
}
@@ -347,34 +415,57 @@ static struct btrfs_device *__alloc_device(void)
return dev;
}
-/*
- * Find a device specified by @devid or @uuid in the list of @fs_devices, or
- * return NULL.
- *
- * If devid and uuid are both specified, the match must be exact, otherwise
- * only devid is used.
- */
-static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, const u8 *uuid)
+static noinline struct btrfs_fs_devices *find_fsid(
+ const u8 *fsid, const u8 *metadata_fsid)
{
- struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices;
+
+ ASSERT(fsid);
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- if (dev->devid == devid &&
- (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
- return dev;
+ if (metadata_fsid) {
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by first scanning
+ * a device which didn't have its fsid/metadata_uuid changed
+ * at all and the CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(metadata_fsid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by a device that
+ * has an outdated pair of fsid/metadata_uuid and
+ * CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(fs_devices->metadata_uuid,
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+ memcmp(metadata_fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
}
}
- return NULL;
-}
-
-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
-{
- struct btrfs_fs_devices *fs_devices;
+ /* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
- return fs_devices;
+ if (metadata_fsid) {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+ && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ } else {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
}
return NULL;
}
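The two identifiers only diverge after a UUID change has been performed (by userspace tooling, presumably btrfstune; an assumption, since the tool is not part of this diff). A compact summary of the cases the rewritten find_fsid() distinguishes:
/* fresh fs:            fsid == metadata_uuid, METADATA_UUID flag unset
 *                       -> matched on fsid alone
 * after uuid change:   fsid != metadata_uuid, METADATA_UUID flag set
 *                       -> matched on the (fsid, metadata_uuid) pair
 * change in progress:  CHANGING_FSID_V2 set; the two split-brain loops
 *                       above pick the fs_devices by whichever of
 *                       fsid/metadata_uuid is still current.
 */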
@@ -622,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+static bool device_path_matched(const char *path, struct btrfs_device *device)
+{
+ int found;
+
+ rcu_read_lock();
+ found = strcmp(rcu_str_deref(device->name), path);
+ rcu_read_unlock();
+
+ return found == 0;
+}
+
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
@@ -629,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
+ * Return: 0 for success or if @path is NULL.
+ * -EBUSY if @path is a mounted device.
+ * -ENOENT if @path does not match any device in the list.
*/
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
+ int ret = 0;
+
+ if (path)
+ ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
- mutex_lock(&fs_devices->device_list_mutex);
- if (fs_devices->opened) {
- mutex_unlock(&fs_devices->device_list_mutex);
- continue;
- }
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
- int not_found = 0;
-
if (skip_device && skip_device == device)
continue;
if (path && !device->name)
continue;
-
- rcu_read_lock();
- if (path)
- not_found = strcmp(rcu_str_deref(device->name),
- path);
- rcu_read_unlock();
- if (not_found)
+ if (path && !device_path_matched(path, device))
continue;
+ if (fs_devices->opened) {
+ /* for an already deleted device return 0 */
+ if (path && ret != 0)
+ ret = -EBUSY;
+ break;
+ }
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
+ ret = 0;
if (fs_devices->num_devices == 0)
break;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
+
+ return ret;
}
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -709,6 +816,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
device->generation = btrfs_super_generation(disk_super);
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ if (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+ pr_err(
+ "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ goto error_brelse;
+ }
+
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = 1;
} else {
@@ -744,6 +858,51 @@ error_brelse:
}
/*
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
+ * being created with a disk that has already completed its fsid change.
+ */
+static struct btrfs_fs_devices *find_fsid_inprogress(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+ BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+ return fs_devices;
+ }
+ }
+
+ return NULL;
+}
+
+
+static struct btrfs_fs_devices *find_fsid_changed(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handles the case where the scanned device is part of an fs that had
+ * multiple successful changes of FSID but the currently scanned device
+ * didn't observe any of them, meaning our fsid will differ from theirs.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, disk_super->fsid,
+ BTRFS_FSID_SIZE) != 0) {
+ return fs_devices;
+ }
+ }
+
+ return NULL;
+}
+
/*
* Add new device to list of registered devices
*
* Returns:
@@ -755,25 +914,72 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bool *new_device_added)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices;
+ struct btrfs_fs_devices *fs_devices = NULL;
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ if (fsid_change_in_progress) {
+ if (!has_metadata_uuid) {
+ /*
+ * When we have an image which has CHANGING_FSID_V2 set
+ * it might belong either to a filesystem whose disks have
+ * completed their fsid change, or to a filesystem with no
+ * UUID change in effect; handle both.
+ */
+ fs_devices = find_fsid_inprogress(disk_super);
+ if (!fs_devices)
+ fs_devices = find_fsid(disk_super->fsid, NULL);
+ } else {
+ fs_devices = find_fsid_changed(disk_super);
+ }
+ } else if (has_metadata_uuid) {
+ fs_devices = find_fsid(disk_super->fsid,
+ disk_super->metadata_uuid);
+ } else {
+ fs_devices = find_fsid(disk_super->fsid, NULL);
+ }
+
- fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
- fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (has_metadata_uuid)
+ fs_devices = alloc_fs_devices(disk_super->fsid,
+ disk_super->metadata_uuid);
+ else
+ fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
+ fs_devices->fsid_change = fsid_change_in_progress;
+
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
- device = find_device(fs_devices, devid,
- disk_super->dev_item.uuid);
+ device = btrfs_find_device(fs_devices, devid,
+ disk_super->dev_item.uuid, NULL, false);
+
+ /*
+ * If this disk has been pulled into an fs devices created by
+ * a device which had the CHANGING_FSID_V2 flag then replace the
+ * metadata_uuid/fsid values of the fs_devices.
+ */
+ if (has_metadata_uuid && fs_devices->fsid_change &&
+ found_transid > fs_devices->latest_generation) {
+ memcpy(fs_devices->fsid, disk_super->fsid,
+ BTRFS_FSID_SIZE);
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ fs_devices->fsid_change = false;
+ }
}
if (!device) {
@@ -850,6 +1056,35 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-EEXIST);
}
+ /*
+ * We are going to replace the device path for a given devid,
+ * make sure it's the same device if the device is mounted
+ */
+ if (device->bdev) {
+ struct block_device *path_bdev;
+
+ path_bdev = lookup_bdev(path);
+ if (IS_ERR(path_bdev)) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_CAST(path_bdev);
+ }
+
+ if (device->bdev != path_bdev) {
+ bdput(path_bdev);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_warn_in_rcu(device->fs_info,
+ "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
+ disk_super->fsid, devid,
+ rcu_str_deref(device->name), path);
+ return ERR_PTR(-EEXIST);
+ }
+ bdput(path_bdev);
+ btrfs_info_in_rcu(device->fs_info,
+ "device fsid %pU devid %llu moved old:%s new:%s",
+ disk_super->fsid, devid,
+ rcu_str_deref(device->name), path);
+ }
+
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
@@ -869,8 +1104,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* it back. We need it to pick the disk with largest generation
* (as above).
*/
- if (!fs_devices->opened)
+ if (!fs_devices->opened) {
device->generation = found_transid;
+ fs_devices->latest_generation = max_t(u64, found_transid,
+ fs_devices->latest_generation);
+ }
fs_devices->total_devices = btrfs_super_num_devices(disk_super);
@@ -884,14 +1122,13 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *device;
struct btrfs_device *orig_dev;
- fs_devices = alloc_fs_devices(orig->fsid);
+ fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
- /* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
@@ -1193,7 +1430,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
p = kmap(*page);
/* align our pointer to the offset of the super block */
- *disk_super = p + (bytenr & ~PAGE_MASK);
+ *disk_super = p + offset_in_page(bytenr);
if (btrfs_super_bytenr(*disk_super) != bytenr ||
btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
@@ -1208,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
return 0;
}
+int btrfs_forget_devices(const char *path)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ mutex_unlock(&uuid_mutex);
+
+ return ret;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -1709,7 +1957,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
- write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
+ ptr, BTRFS_FSID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
@@ -1862,12 +2111,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
u64 num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
ASSERT(num_devices > 1);
num_devices--;
}
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
return num_devices;
}
@@ -1900,6 +2149,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
goto out;
}
+ if (btrfs_pinned_by_swapfile(fs_info, device)) {
+ btrfs_warn_in_rcu(fs_info,
+ "cannot remove device %s (devid %llu) due to active swapfile",
+ rcu_str_deref(device->name), device->devid);
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
@@ -2132,7 +2389,13 @@ static struct btrfs_device *btrfs_find_device_by_path(
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->metadata_uuid, true);
+ else
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->fsid, true);
+
brelse(bh);
if (!device)
device = ERR_PTR(-ENOENT);
@@ -2140,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
return device;
}
-static struct btrfs_device *btrfs_find_device_missing_or_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
-{
- struct btrfs_device *device = NULL;
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(tmp, devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &tmp->dev_state) && !tmp->bdev) {
- device = tmp;
- break;
- }
- }
-
- if (!device)
- return ERR_PTR(-ENOENT);
- } else {
- device = btrfs_find_device_by_path(fs_info, device_path);
- }
-
- return device;
-}
-
/*
 * Look up a device given by device id, or by path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
- struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
+ struct btrfs_fs_info *fs_info, u64 devid,
+ const char *device_path)
{
struct btrfs_device *device;
if (devid) {
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+ NULL, true);
if (!device)
return ERR_PTR(-ENOENT);
- } else {
- if (!devpath || !devpath[0])
- return ERR_PTR(-EINVAL);
- device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
+ return device;
}
- return device;
+
+ if (!device_path || !device_path[0])
+ return ERR_PTR(-EINVAL);
+
+ if (strcmp(device_path, "missing") == 0) {
+ /* Find first missing device */
+ list_for_each_entry(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) && !device->bdev)
+ return device;
+ }
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_find_device_by_path(fs_info, device_path);
}
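/*
 * A compact user-space model of the devspec dispatch above: a non-zero id
 * wins, the literal string "missing" selects the first device without a
 * backing block device, and anything else is treated as a path. The types
 * and sample data are illustrative stand-ins, not the kernel's.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct mdev { uint64_t devid; const char *path; int has_bdev; };

static struct mdev devs[] = {
	{ 1, "/dev/sda", 1 }, { 2, "(missing)", 0 },
};

static struct mdev *find_by_devspec(uint64_t devid, const char *spec)
{
	unsigned i;

	if (devid) {
		for (i = 0; i < 2; i++)
			if (devs[i].devid == devid)
				return &devs[i];
		return NULL;
	}
	if (!spec || !spec[0])
		return NULL;
	if (strcmp(spec, "missing") == 0) {
		for (i = 0; i < 2; i++)
			if (!devs[i].has_bdev)	/* first missing device */
				return &devs[i];
		return NULL;
	}
	for (i = 0; i < 2; i++)
		if (strcmp(devs[i].path, spec) == 0)
			return &devs[i];
	return NULL;
}

int main(void)
{
	struct mdev *d = find_by_devspec(0, "missing");

	printf("missing -> devid %llu\n", d ? (unsigned long long)d->devid : 0ULL);
	return 0;
}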
/*
@@ -2202,7 +2453,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
- seed_devices = alloc_fs_devices(NULL);
+ seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
@@ -2238,7 +2489,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
- memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2305,7 +2556,8 @@ next_slot:
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2480,7 +2732,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
* so rename the fsid on the sysfs
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
- fs_info->fsid);
+ fs_info->fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_info,
"sysfs: failed to create fsid for sprout");
@@ -2718,8 +2970,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @fs_info:	Filesystem that owns the mapping tree.
+ * @logical:	Logical block offset in bytes.
+ * @length:	Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -2756,7 +3015,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = get_chunk_map(fs_info, chunk_offset, 1);
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
@@ -2797,13 +3056,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
mutex_unlock(&fs_info->chunk_mutex);
}
- if (map->stripes[i].dev) {
- ret = btrfs_update_device(trans, map->stripes[i].dev);
- if (ret) {
- mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ ret = btrfs_update_device(trans, device);
+ if (ret) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -3437,17 +3694,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct list_head *devices;
- struct btrfs_device *device;
- u64 old_size;
- u64 size_to_free;
u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_trans_handle *trans;
struct extent_buffer *leaf;
int slot;
int ret;
@@ -3462,53 +3713,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_sys = 0;
int chunk_reserved = 0;
- /* step one make some room on all the devices */
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
- old_size = btrfs_device_get_total_bytes(device);
- size_to_free = div_factor(old_size, 1);
- size_to_free = min_t(u64, size_to_free, SZ_1M);
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
- btrfs_device_get_total_bytes(device) -
- btrfs_device_get_bytes_used(device) > size_to_free ||
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
- continue;
-
- ret = btrfs_shrink_device(device, old_size - size_to_free);
- if (ret == -ENOSPC)
- break;
- if (ret) {
- /* btrfs_shrink_device never returns ret > 0 */
- WARN_ON(ret > 0);
- goto error;
- }
-
- trans = btrfs_start_transaction(dev_root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_info_in_rcu(fs_info,
- "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
- rcu_str_deref(device->name), ret,
- old_size, old_size - size_to_free);
- goto error;
- }
-
- ret = btrfs_grow_device(trans, device, old_size);
- if (ret) {
- btrfs_end_transaction(trans);
- /* btrfs_grow_device never returns ret > 0 */
- WARN_ON(ret > 0);
- btrfs_info_in_rcu(fs_info,
- "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
- rcu_str_deref(device->name), ret,
- old_size, old_size - size_to_free);
- goto error;
- }
-
- btrfs_end_transaction(trans);
- }
-
- /* step two, relocate all the chunks */
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -3638,10 +3842,15 @@ again:
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret && ret != -ENOSPC)
- goto error;
if (ret == -ENOSPC) {
enospc_errors++;
+ } else if (ret == -ETXTBSY) {
+ btrfs_info(fs_info,
+ "skipping relocation of block group %llu due to active swapfile",
+ found_key.offset);
+ ret = 0;
+ } else if (ret) {
+ goto error;
} else {
spin_lock(&fs_info->balance_lock);
bctl->stat.completed++;
@@ -3712,6 +3921,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
}
/*
+ * Fill @buf with a textual description of the balance filter flags @bargs,
+ * writing at most @size_buf bytes including the terminating NUL. The output
+ * is truncated if it does not fit into the provided buffer.
+ */
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+ u32 size_buf)
+{
+ int ret;
+ u32 size_bp = size_buf;
+ char *bp = buf;
+ u64 flags = bargs->flags;
+ char tmp_buf[128] = {'\0'};
+
+ if (!flags)
+ return;
+
+#define CHECK_APPEND_NOARG(a) \
+ do { \
+ ret = snprintf(bp, size_bp, (a)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+#define CHECK_APPEND_1ARG(a, v1) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+ if (flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ int index = btrfs_bg_flags_to_raid_index(bargs->target);
+
+ CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index));
+ }
+
+ if (flags & BTRFS_BALANCE_ARGS_SOFT)
+ CHECK_APPEND_NOARG("soft,");
+
+ if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+ btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+ sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+ }
+
+ if (flags & BTRFS_BALANCE_ARGS_USAGE)
+ CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+ if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+ CHECK_APPEND_2ARG("usage=%u..%u,",
+ bargs->usage_min, bargs->usage_max);
+
+ if (flags & BTRFS_BALANCE_ARGS_DEVID)
+ CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+ if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+ CHECK_APPEND_2ARG("drange=%llu..%llu,",
+ bargs->pstart, bargs->pend);
+
+ if (flags & BTRFS_BALANCE_ARGS_VRANGE)
+ CHECK_APPEND_2ARG("vrange=%llu..%llu,",
+ bargs->vstart, bargs->vend);
+
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT)
+ CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
+
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
+ CHECK_APPEND_2ARG("limit=%u..%u,",
+ bargs->limit_min, bargs->limit_max);
+
+ if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
+ CHECK_APPEND_2ARG("stripes=%u..%u,",
+ bargs->stripes_min, bargs->stripes_max);
+
+#undef CHECK_APPEND_2ARG
+#undef CHECK_APPEND_1ARG
+#undef CHECK_APPEND_NOARG
+
+out_overflow:
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
+ else
+ buf[0] = '\0';
+}
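/*
 * A minimal user-space sketch of the bounded snprintf-append pattern used by
 * describe_balance_args() above: each append checks the remaining space and
 * bails out on truncation, so the buffer can never overflow. The names here
 * are illustrative, not part of the kernel API.
 */
#include <stdio.h>
#include <string.h>

static void append_flags(char *buf, size_t size_buf)
{
	char *bp = buf;
	size_t size_bp = size_buf;
	int ret;

#define CHECK_APPEND_1ARG(fmt, v1)				\
	do {							\
		ret = snprintf(bp, size_bp, (fmt), (v1));	\
		if (ret < 0 || (size_t)ret >= size_bp)		\
			goto out_overflow;			\
		size_bp -= ret;					\
		bp += ret;					\
	} while (0)

	CHECK_APPEND_1ARG("usage=%llu,", 50ULL);
	CHECK_APPEND_1ARG("devid=%llu,", 1ULL);

#undef CHECK_APPEND_1ARG

out_overflow:
	/* Trim the trailing comma, exactly as the kernel helper does. */
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0';
	else
		buf[0] = '\0';
}

int main(void)
{
	char buf[64];

	append_flags(buf, sizeof(buf));
	printf("%s\n", buf);	/* prints: usage=50,devid=1 */
	return 0;
}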
+
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
+{
+ u32 size_buf = 1024;
+ char tmp_buf[192] = {'\0'};
+ char *buf;
+ char *bp;
+ u32 size_bp = size_buf;
+ int ret;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ buf = kzalloc(size_buf, GFP_KERNEL);
+ if (!buf)
+ return;
+
+ bp = buf;
+
+#define CHECK_APPEND_1ARG(a, v1) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+ if (bctl->flags & BTRFS_BALANCE_FORCE)
+ CHECK_APPEND_1ARG("%s", "-f ");
+
+ if (bctl->flags & BTRFS_BALANCE_DATA) {
+ describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-d%s ", tmp_buf);
+ }
+
+ if (bctl->flags & BTRFS_BALANCE_METADATA) {
+ describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-m%s ", tmp_buf);
+ }
+
+ if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
+ describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-s%s ", tmp_buf);
+ }
+
+#undef CHECK_APPEND_1ARG
+
+out_overflow:
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
+ btrfs_info(fs_info, "balance: %s %s",
+ (bctl->flags & BTRFS_BALANCE_RESUME) ?
+ "resume" : "start", buf);
+
+ kfree(buf);
+}
+
+/*
 * Must be called with the balance mutex held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -3724,6 +4089,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
int ret;
u64 num_devices;
unsigned seq;
+ bool reducing_integrity;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@@ -3803,24 +4169,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
!(bctl->sys.target & allowed)) ||
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
- !(bctl->meta.target & allowed))) {
- if (bctl->flags & BTRFS_BALANCE_FORCE) {
- btrfs_info(fs_info,
- "balance: force reducing metadata integrity");
- } else {
- btrfs_err(fs_info,
- "balance: reduces metadata integrity, use --force if you want this");
- ret = -EINVAL;
- goto out;
- }
- }
+ !(bctl->meta.target & allowed)))
+ reducing_integrity = true;
+ else
+ reducing_integrity = false;
+
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
- /* if we're not converting, the target field is uninitialized */
- meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
- bctl->meta.target : fs_info->avail_metadata_alloc_bits;
- data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
- bctl->data.target : fs_info->avail_data_alloc_bits;
+ if (reducing_integrity) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ btrfs_info(fs_info,
+ "balance: force reducing metadata integrity");
+ } else {
+ btrfs_err(fs_info,
+ "balance: reduces metadata integrity, use --force if you want this");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
@@ -3850,11 +4222,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+ describe_balance_start_or_resume(fs_info);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+ btrfs_info(fs_info, "balance: paused");
+ else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+ btrfs_info(fs_info, "balance: canceled");
+ else
+ btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
if (bargs) {
@@ -3887,10 +4267,8 @@ static int balance_kthread(void *data)
int ret = 0;
mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl) {
- btrfs_info(fs_info, "balance: resuming");
+ if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
- }
mutex_unlock(&fs_info->balance_mutex);
return ret;
@@ -4433,10 +4811,16 @@ again:
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret && ret != -ENOSPC)
- goto done;
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
failed++;
+ } else if (ret) {
+ if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "could not shrink block group %llu due to active swapfile",
+ chunk_offset);
+ }
+ goto done;
+ }
} while (key.offset-- > 0);
if (failed && !retried) {
@@ -4602,11 +4986,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int devs_min; /* min devs needed */
int devs_increment; /* ndevs has to be a multiple of this */
	int ncopies;	/* how many copies the data has */
+ int nparity; /* number of stripes worth of bytes to
+ store parity information */
int ret;
u64 max_stripe_size;
u64 max_chunk_size;
u64 stripe_size;
- u64 num_bytes;
+ u64 chunk_size;
int ndevs;
int i;
int j;
@@ -4628,6 +5014,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
devs_min = btrfs_raid_array[index].devs_min;
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
+ nparity = btrfs_raid_array[index].nparity;
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = SZ_1G;
@@ -4654,7 +5041,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
BUG_ON(1);
}
- /* we don't want a chunk larger than 10% of writeable space */
+ /* We don't want a chunk larger than 10% of writable space */
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
@@ -4757,30 +5144,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* this will have to be fixed for RAID1 and RAID10 over
* more drives
*/
- data_stripes = num_stripes / ncopies;
-
- if (type & BTRFS_BLOCK_GROUP_RAID5)
- data_stripes = num_stripes - 1;
-
- if (type & BTRFS_BLOCK_GROUP_RAID6)
- data_stripes = num_stripes - 2;
+ data_stripes = (num_stripes - nparity) / ncopies;
/*
* Use the number of data stripes to figure out how big this chunk
* is really going to be in terms of logical address space,
- * and compare that answer with the max chunk size
+ * and compare that answer with the max chunk size. If it's higher,
+ * we try to reduce stripe_size.
*/
if (stripe_size * data_stripes > max_chunk_size) {
- stripe_size = div_u64(max_chunk_size, data_stripes);
-
- /* bump the answer up to a 16MB boundary */
- stripe_size = round_up(stripe_size, SZ_16M);
-
/*
- * But don't go higher than the limits we found while searching
- * for free extents
+ * Reduce stripe_size, round it up to a 16MB boundary again and
+ * then use it, unless it ends up being even bigger than the
+ * previous value we had already.
*/
- stripe_size = min(devices_info[ndevs - 1].max_avail,
+ stripe_size = min(round_up(div_u64(max_chunk_size,
+ data_stripes), SZ_16M),
stripe_size);
}
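/*
 * A self-contained sketch of the chunk sizing arithmetic above, assuming
 * RAID6-like parameters (nparity = 2, ncopies = 1). The sample values are
 * made up; SZ_16M and the clamping mirror the logic in __btrfs_alloc_chunk().
 */
#include <stdio.h>
#include <stdint.h>

#define SZ_16M (16ULL * 1024 * 1024)

static uint64_t round_up_u64(uint64_t x, uint64_t align)
{
	return (x + align - 1) / align * align;
}

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	int num_stripes = 6, nparity = 2, ncopies = 1;
	uint64_t stripe_size = 1ULL << 30;	/* from the free-extent search */
	uint64_t max_chunk_size = 2ULL << 30;	/* e.g. 10% of writable space */
	int data_stripes = (num_stripes - nparity) / ncopies;

	/* Reduce stripe_size if the chunk would exceed the cap, but never grow it. */
	if (stripe_size * data_stripes > max_chunk_size)
		stripe_size = min_u64(round_up_u64(max_chunk_size / data_stripes,
						   SZ_16M), stripe_size);

	printf("data_stripes=%d stripe_size=%llu chunk_size=%llu\n",
	       data_stripes, (unsigned long long)stripe_size,
	       (unsigned long long)(stripe_size * data_stripes));
	return 0;
}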
@@ -4808,9 +5187,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->type = type;
map->sub_stripes = sub_stripes;
- num_bytes = stripe_size * data_stripes;
+ chunk_size = stripe_size * data_stripes;
- trace_btrfs_chunk_alloc(info, map, start, num_bytes);
+ trace_btrfs_chunk_alloc(info, map, start, chunk_size);
em = alloc_extent_map();
if (!em) {
@@ -4821,7 +5200,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
- em->len = num_bytes;
+ em->len = chunk_size;
em->block_start = 0;
em->block_len = em->len;
em->orig_block_len = stripe_size;
@@ -4839,14 +5218,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
refcount_inc(&em->refs);
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
+ ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
if (ret)
goto error_del_extent;
- for (i = 0; i < map->num_stripes; i++) {
- num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
- btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
- }
+ for (i = 0; i < map->num_stripes; i++)
+ btrfs_device_set_bytes_used(map->stripes[i].dev,
+ map->stripes[i].dev->bytes_used + stripe_size);
atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
@@ -4890,7 +5268,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -4971,10 +5349,10 @@ out:
}
/*
- * Chunk allocation falls into two parts. The first part does works
- * that make the new allocated chunk useable, but not do any operation
- * that modifies the chunk tree. The second part does the works that
- * require modifying the chunk tree. This division is important for the
+ * Chunk allocation falls into two parts. The first part does work
+ * that makes the new allocated chunk usable, but does not do any operation
+ * that modifies the chunk tree. The second part does the work that
+ * requires modifying the chunk tree. This division is important for the
* bootstrap process of adding storage to a seed btrfs.
*/
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
@@ -5032,7 +5410,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
int miss_ndevs = 0;
int i;
- em = get_chunk_map(fs_info, chunk_offset, 1);
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
return 1;
@@ -5092,7 +5470,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret;
- em = get_chunk_map(fs_info, logical, len);
+ em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
/*
* We could return errors for these cases, but that could get
@@ -5122,11 +5500,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = 1;
free_extent_map(em);
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev)
ret++;
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
return ret;
}
@@ -5138,7 +5516,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
- em = get_chunk_map(fs_info, logical, len);
+ em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
@@ -5155,7 +5533,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
- em = get_chunk_map(fs_info, logical, len);
+ em = btrfs_get_chunk_map(fs_info, logical, len);
if(!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
@@ -5314,7 +5692,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
	/* discard always returns a bbio */
ASSERT(bbio_ret);
- em = get_chunk_map(fs_info, logical, length);
+ em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -5640,7 +6018,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
return __btrfs_map_block_for_discard(fs_info, logical,
*length, bbio_ret);
- em = get_chunk_map(fs_info, logical, *length);
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -5699,17 +6077,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*length = em->len - offset;
}
- /* This is for when we're called from btrfs_merge_bio_hook() and all
- it cares about is the length */
+ /*
+ * This is for when we're called from btrfs_bio_fits_in_stripe and all
+ * it cares about is the length
+ */
if (!bbio_ret)
goto out;
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ /*
+ * Hold the semaphore for read during the whole operation, write is
+ * requested at commit time but must wait.
+ */
if (!dev_replace_is_ongoing)
- btrfs_dev_replace_read_unlock(dev_replace);
- else
- btrfs_dev_replace_set_lock_blocking(dev_replace);
+ up_read(&dev_replace->rwsem);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
!need_full_stripe(op) && dev_replace->tgtdev != NULL) {
@@ -5904,12 +6286,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
out:
if (dev_replace_is_ongoing) {
- ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
- btrfs_dev_replace_read_lock(dev_replace);
- /* Barrier implied by atomic_dec_and_test */
- if (atomic_dec_and_test(&dev_replace->blocking_readers))
- cond_wake_up_nomb(&dev_replace->read_lock_wq);
- btrfs_dev_replace_read_unlock(dev_replace);
+ lockdep_assert_held(&dev_replace->rwsem);
+ /* Unlock and let waiting writers proceed */
+ up_read(&dev_replace->rwsem);
}
free_extent_map(em);
return ret;
@@ -5943,7 +6322,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 rmap_len;
int i, j, nr = 0;
- em = get_chunk_map(fs_info, chunk_start, 1);
+ em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
return -EIO;
@@ -6083,12 +6462,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
- !device->bdev) {
- bio_io_error(bio);
- return;
- }
-
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
btrfsic_submit_bio(bio);
@@ -6217,7 +6590,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
- if (!dev || !dev->bdev ||
+ if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
+ &dev->dev_state) ||
(bio_op(first_bio) == REQ_OP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
@@ -6236,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
return BLK_STS_OK;
}
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ *
+ * If @seed is true, also traverse the list of seed devices.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid,
+ bool seed)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *cur_devices;
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
+ while (fs_devices) {
if (!fsid ||
- !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
- device = find_device(cur_devices, devid, uuid);
- if (device)
- return device;
+ !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices,
+ dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
}
- cur_devices = cur_devices->seed;
+ if (seed)
+ fs_devices = fs_devices->seed;
+ else
+ return NULL;
}
return NULL;
}
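/*
 * A user-space model of the lookup semantics documented above: walk a chain
 * of device lists, matching on devid and, when given, on uuid and fsid. The
 * structures are simplified stand-ins; the kernel compares metadata_uuid,
 * which this model folds into a single fsid field.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>

struct dev { uint64_t devid; char uuid[16]; struct dev *next; };
struct devs { char fsid[16]; struct dev *head; struct devs *seed; };

static struct dev *find_device(struct devs *fs_devices, uint64_t devid,
			       const char *uuid, const char *fsid, bool seed)
{
	while (fs_devices) {
		if (!fsid || !memcmp(fs_devices->fsid, fsid, 16)) {
			for (struct dev *d = fs_devices->head; d; d = d->next)
				if (d->devid == devid &&
				    (!uuid || !memcmp(d->uuid, uuid, 16)))
					return d;
		}
		if (!seed)	/* only descend into seeds when asked */
			return NULL;
		fs_devices = fs_devices->seed;
	}
	return NULL;
}

int main(void)
{
	struct dev d2 = { 2, "uuid-2", NULL };
	struct devs seed_devs = { "fsid-seed", &d2, NULL };
	struct dev d1 = { 1, "uuid-1", NULL };
	struct devs sprout = { "fsid-sprout", &d1, &seed_devs };

	printf("devid 2 %s\n", find_device(&sprout, 2, NULL, NULL, true)
	       ? "found in seed chain" : "not found");
	return 0;
}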
@@ -6402,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
}
if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != 1)) {
btrfs_err(fs_info,
@@ -6495,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info, devid,
- uuid, NULL);
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
+ devid, uuid, NULL, true);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -6574,12 +6963,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = fs_devices->seed;
}
- fs_devices = find_fsid(fsid);
+ fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
- fs_devices = alloc_fs_devices(fsid);
+ fs_devices = alloc_fs_devices(fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6629,13 +7018,14 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -6876,7 +7266,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (missing > max_tolerated) {
if (!failing_dev)
btrfs_warn(fs_info,
- "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
em->start, missing, max_tolerated);
free_extent_map(em);
ret = false;
@@ -7225,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
+ true);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7387,6 +7778,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
struct extent_map *em;
struct map_lookup *map;
+ struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
@@ -7436,6 +7828,35 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
physical_offset, devid);
ret = -EUCLEAN;
}
+
+	/* Make sure no dev extent is beyond device boundary */
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ if (!dev) {
+ btrfs_err(fs_info, "failed to find devid %llu", devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+	/* It's possible this device is a dummy for a seed device */
+ if (dev->disk_total_bytes == 0) {
+ dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
+ NULL, false);
+ if (!dev) {
+ btrfs_err(fs_info, "failed to find seed devid %llu",
+ devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ if (physical_offset + physical_len > dev->disk_total_bytes) {
+ btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+ devid, physical_offset, physical_len,
+ dev->disk_total_bytes);
+ ret = -EUCLEAN;
+ goto out;
+ }
out:
free_extent_map(em);
return ret;
@@ -7478,6 +7899,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
struct btrfs_path *path;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
+ u64 prev_devid = 0;
+ u64 prev_dev_ext_end = 0;
int ret = 0;
key.objectid = 1;
@@ -7522,10 +7945,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
physical_len = btrfs_dev_extent_length(leaf, dext);
+ /* Check if this dev extent overlaps with the previous one */
+ if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+ devid, physical_offset, prev_dev_ext_end);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
goto out;
+ prev_devid = devid;
+ prev_dev_ext_end = physical_offset + physical_len;
+
ret = btrfs_next_item(root, path);
if (ret < 0)
goto out;
@@ -7541,3 +7976,27 @@ out:
btrfs_free_path(path);
return ret;
}
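/*
 * A self-contained sketch of the overlap check used while iterating dev
 * extents in devid/physical-offset order above: remember the end of the
 * previous extent and reject any extent that starts before it. Sample data
 * only; the third extent below deliberately overlaps the second.
 */
#include <stdio.h>
#include <stdint.h>

struct dev_extent { uint64_t devid, offset, len; };

int main(void)
{
	struct dev_extent exts[] = {
		{ 1, 0, 1024 }, { 1, 1024, 1024 }, { 1, 1536, 512 },
	};
	uint64_t prev_devid = 0, prev_end = 0;

	for (unsigned i = 0; i < sizeof(exts) / sizeof(exts[0]); i++) {
		if (exts[i].devid == prev_devid && exts[i].offset < prev_end) {
			printf("overlap: devid %llu offset %llu < prev end %llu\n",
			       (unsigned long long)exts[i].devid,
			       (unsigned long long)exts[i].offset,
			       (unsigned long long)prev_end);
			return 1;
		}
		prev_devid = exts[i].devid;
		prev_end = exts[i].offset + exts[i].len;
	}
	printf("no overlaps\n");
	return 0;
}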
+
+/*
+ * Check whether the given block group or device is pinned by any inode being
+ * used as a swapfile.
+ */
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
+{
+ struct btrfs_swapfile_pin *sp;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ node = fs_info->swapfile_pins.rb_node;
+ while (node) {
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+ if (ptr < sp->ptr)
+ node = node->rb_left;
+ else if (ptr > sp->ptr)
+ node = node->rb_right;
+ else
+ break;
+ }
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ return node != NULL;
+}
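/*
 * A minimal model of the pointer-keyed search above: a binary search tree
 * ordered by raw pointer value, where a hit means the block group or device
 * is pinned. The node layout is a simplification of the kernel's rb-tree;
 * as in the kernel code, raw pointer values are compared directly.
 */
#include <stdio.h>
#include <stdbool.h>

struct pin_node { void *ptr; struct pin_node *left, *right; };

static bool pinned(struct pin_node *node, void *ptr)
{
	while (node) {
		if (ptr < node->ptr)
			node = node->left;
		else if (ptr > node->ptr)
			node = node->right;
		else
			return true;
	}
	return false;
}

int main(void)
{
	int a, b;
	struct pin_node na = { &a, NULL, NULL };

	printf("a pinned: %d, b pinned: %d\n", pinned(&na, &a), pinned(&na, &b));
	return 0;
}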
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce895e994..3ad9d58d1b66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ u8 metadata_uuid[BTRFS_FSID_SIZE];
+ bool fsid_change;
struct list_head fs_list;
u64 num_devices;
@@ -218,6 +220,10 @@ struct btrfs_fs_devices {
u64 missing_devices;
u64 total_rw_bytes;
u64 total_devices;
+
+ /* Highest generation number of seen devices */
+ u64 latest_generation;
+
struct block_device *latest_bdev;
/* all of the devices in the FS, protected by a mutex
@@ -261,15 +267,12 @@ struct btrfs_fs_devices {
* we allocate are actually btrfs_io_bios. We'll cram as much of
* struct btrfs_bio as we can into this over time.
*/
-typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
struct btrfs_io_bio {
unsigned int mirror_num;
unsigned int stripe_index;
u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
- u8 *csum_allocated;
- btrfs_io_bio_end_io_t *end_io;
struct bvec_iter iter;
/*
* This member must come last, bio_alloc_bioset will allocate enough
@@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
return container_of(bio, struct btrfs_io_bio, bio);
}
+static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+{
+ if (io_bio->csum != io_bio->csum_inline) {
+ kfree(io_bio->csum);
+ io_bio->csum = NULL;
+ }
+}
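/*
 * A user-space sketch of the inline-or-heap buffer pattern behind
 * btrfs_io_bio_free_csum(): small checksums live in an embedded array and
 * only heap-allocated ones are freed. Sizes and names are illustrative.
 */
#include <stdlib.h>

#define INLINE_CSUM_SIZE 64

struct io_ctx {
	unsigned char *csum;
	unsigned char csum_inline[INLINE_CSUM_SIZE];
};

static int ctx_alloc_csum(struct io_ctx *ctx, size_t len)
{
	if (len <= INLINE_CSUM_SIZE) {
		ctx->csum = ctx->csum_inline;	/* no allocation needed */
		return 0;
	}
	ctx->csum = malloc(len);
	return ctx->csum ? 0 : -1;
}

static void ctx_free_csum(struct io_ctx *ctx)
{
	/* Mirrors the helper: only heap buffers are freed. */
	if (ctx->csum != ctx->csum_inline) {
		free(ctx->csum);
		ctx->csum = NULL;
	}
}

int main(void)
{
	struct io_ctx ctx;

	if (ctx_alloc_csum(&ctx, 32) == 0)
		ctx_free_csum(&ctx);	/* no-op: inline buffer */
	if (ctx_alloc_csum(&ctx, 4096) == 0)
		ctx_free_csum(&ctx);	/* heap buffer, actually freed */
	return 0;
}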
+
struct btrfs_bio_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio;
-typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
-
struct btrfs_bio {
refcount_t refs;
atomic_t stripes_pending;
@@ -331,6 +339,8 @@ struct btrfs_raid_attr {
int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
	int ncopies;	/* how many copies the data has */
+ int nparity; /* number of stripes worth of bytes to store
+ * parity information */
int mindev_error; /* error code if min devs requisite is unmet */
const char raid_name[8]; /* name of the raid */
u64 bg_flag; /* block group flag of the raid */
@@ -406,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -423,13 +434,14 @@ void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid, bool seed);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
+void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
@@ -462,6 +474,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ea78c3d6dcfc..f141b45ce349 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -11,6 +11,7 @@
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/iversion.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
@@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode,
{
const struct xattr *xattr;
struct btrfs_trans_handle *trans = fs_info;
+ unsigned int nofs_flag;
char *name;
int err = 0;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
strlen(xattr->name) + 1, GFP_KERNEL);
@@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode,
if (err < 0)
break;
}
+ memalloc_nofs_restore(nofs_flag);
return err;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 970ff3e35bb3..b86b7ad6b900 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -27,6 +27,33 @@ struct workspace {
int level;
};
+static struct workspace_manager wsm;
+
+static void zlib_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
+}
+
+static void zlib_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *zlib_get_workspace(unsigned int level)
+{
+ struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ workspace->level = level;
+
+ return ws;
+}
+
+static void zlib_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(void)
+static struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -390,18 +418,19 @@ next:
return ret;
}
-static void zlib_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zlib_set_level(unsigned int level)
{
- struct workspace *workspace = list_entry(ws, struct workspace, list);
- unsigned level = (type & 0xF0) >> 4;
-
- if (level > 9)
- level = 9;
+ if (!level)
+ return BTRFS_ZLIB_DEFAULT_LEVEL;
- workspace->level = level > 0 ? level : 3;
+ return min_t(unsigned int, level, 9);
}
const struct btrfs_compress_op btrfs_zlib_compress = {
+ .init_workspace_manager = zlib_init_workspace_manager,
+ .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
+ .get_workspace = zlib_get_workspace,
+ .put_workspace = zlib_put_workspace,
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index af6ec59972f5..3e418a3aeb11 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -6,25 +6,31 @@
*/
#include <linux/bio.h>
+#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/pagemap.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include "compression.h"
+#include "ctree.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+ size_t src_len)
{
- ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
- src_len, 0);
+ ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +42,290 @@ struct workspace {
void *mem;
size_t size;
char *buf;
+ unsigned int level;
+ unsigned int req_level;
+ unsigned long last_used; /* jiffies */
struct list_head list;
+ struct list_head lru_list;
ZSTD_inBuffer in_buf;
ZSTD_outBuffer out_buf;
};
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru. Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace is done by using the bitmap to identify the levels that
+ * have available workspaces and scanning upwards. This lets us recycle higher
+ * level workspaces because of the monotonic memory guarantee. A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level. Putting a workspace involves adding it back to the appropriate places
+ * and adding it back to the lru if necessary.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
+
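/*
 * A user-space model of the scan-up described above: a bitmap marks levels
 * with an idle workspace, and a request for level N takes the first set bit
 * at or above N, relying on the monotonic memory guarantee. Illustrative
 * only; the kernel uses for_each_set_bit_from() over wsm.active_map.
 */
#include <stdio.h>

#define MAX_LEVEL 15

static int find_level(unsigned long active_map, int level)
{
	for (int i = level - 1; i < MAX_LEVEL; i++)
		if (active_map & (1UL << i))
			return i + 1;	/* bit i corresponds to level i + 1 */
	return 0;			/* nothing available: allocate anew */
}

int main(void)
{
	unsigned long active_map = (1UL << 2) | (1UL << 14);	/* levels 3, 15 idle */

	printf("request 1 -> level %d\n", find_level(active_map, 1));	/* 3 */
	printf("request 5 -> level %d\n", find_level(active_map, 5));	/* 15 */
	return 0;
}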
+struct zstd_workspace_manager {
+ const struct btrfs_compress_op *ops;
+ spinlock_t lock;
+ struct list_head lru_list;
+ struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+ unsigned long active_map;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+ return container_of(list, struct workspace, list);
+}
+
+/*
+ * zstd_reclaim_timer_fn - reclaim timer
+ * @timer: timer
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+ unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+ struct list_head *pos, *next;
+
+ spin_lock(&wsm.lock);
+
+ if (list_empty(&wsm.lru_list)) {
+ spin_unlock(&wsm.lock);
+ return;
+ }
+
+ list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ struct workspace *victim = container_of(pos, struct workspace,
+ lru_list);
+ unsigned int level;
+
+ if (time_after(victim->last_used, reclaim_threshold))
+ break;
+
+ /* workspace is in use */
+ if (victim->req_level)
+ continue;
+
+ level = victim->level;
+ list_del(&victim->lru_list);
+ list_del(&victim->list);
+ wsm.ops->free_workspace(&victim->list);
+
+ if (list_empty(&wsm.idle_ws[level - 1]))
+ clear_bit(level - 1, &wsm.active_map);
+
+ }
+
+ if (!list_empty(&wsm.lru_list))
+ mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+ spin_unlock(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace. In order to reuse
+ * workspaces, this must be made a monotonic relationship. This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+ size_t max_size = 0;
+ unsigned int level;
+
+ for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ ZSTD_parameters params =
+ zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+ size_t level_size =
+ max_t(size_t,
+ ZSTD_CStreamWorkspaceBound(params.cParams),
+ ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+
+ max_size = max_t(size_t, max_size, level_size);
+ zstd_ws_mem_sizes[level - 1] = max_size;
+ }
+}
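/*
 * A stripped-down sketch of the running-max enforcement above: whatever the
 * per-level requirement, the stored bound never decreases as the level
 * rises, so a higher-level workspace can always serve a lower level. The
 * sample sizes are made up.
 */
#include <stdio.h>
#include <stddef.h>

#define MAX_LEVEL 15

int main(void)
{
	size_t bounds[MAX_LEVEL];
	size_t max_size = 0;

	for (int level = 1; level <= MAX_LEVEL; level++) {
		/* Pretend level 4 needs less memory than level 3. */
		size_t level_size = (level == 4) ? 2500 : (size_t)level * 1000;

		max_size = level_size > max_size ? level_size : max_size;
		bounds[level - 1] = max_size;	/* monotone non-decreasing */
	}

	printf("level 3 bound %zu, level 4 bound %zu\n", bounds[2], bounds[3]);
	return 0;
}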
+
+static void zstd_init_workspace_manager(void)
+{
+ struct list_head *ws;
+ int i;
+
+ zstd_calc_ws_mem_sizes();
+
+ wsm.ops = &btrfs_zstd_compress;
+ spin_lock_init(&wsm.lock);
+ init_waitqueue_head(&wsm.wait);
+ timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+ INIT_LIST_HEAD(&wsm.lru_list);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+ ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ if (IS_ERR(ws)) {
+ pr_warn(
+ "BTRFS: cannot preallocate zstd compression workspace\n");
+ } else {
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+ list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ }
+}
+
+static void zstd_cleanup_workspace_manager(void)
+{
+ struct workspace *workspace;
+ int i;
+
+ del_timer(&wsm.timer);
+
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&wsm.idle_ws[i])) {
+ workspace = container_of(wsm.idle_ws[i].next,
+ struct workspace, list);
+ list_del(&workspace->list);
+ list_del(&workspace->lru_list);
+ wsm.ops->free_workspace(&workspace->list);
+ }
+ }
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level. This lets us utilize already allocated workspaces before
+ * allocating a new one. If the workspace is of a larger size, it is used, but
+ * its place in the lru_list and its last_used time are not updated. This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ struct workspace *workspace;
+ int i = level - 1;
+
+ spin_lock(&wsm.lock);
+ for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&wsm.idle_ws[i])) {
+ ws = wsm.idle_ws[i].next;
+ workspace = list_to_workspace(ws);
+ list_del_init(ws);
+ /* keep its place if it's a lower level using this */
+ workspace->req_level = level;
+ if (level == workspace->level)
+ list_del(&workspace->lru_list);
+ if (list_empty(&wsm.idle_ws[i]))
+ clear_bit(i, &wsm.active_map);
+ spin_unlock(&wsm.lock);
+ return ws;
+ }
+ }
+ spin_unlock(&wsm.lock);
+
+ return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used. Therefore, we begin
+ * scanning from 1. We first scan through the idle workspaces and only then
+ * attempt to allocate a new workspace. If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+static struct list_head *zstd_get_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ unsigned int nofs_flag;
+
+ /* level == 0 means we can use any workspace */
+ if (!level)
+ level = 1;
+
+again:
+ ws = zstd_find_workspace(level);
+ if (ws)
+ return ws;
+
+ nofs_flag = memalloc_nofs_save();
+ ws = wsm.ops->alloc_workspace(level);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (IS_ERR(ws)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&wsm.wait, &wait);
+
+ goto again;
+ }
+
+ return ws;
+}
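/*
 * A pthread sketch of the retry loop above: on allocation failure the caller
 * sleeps until another thread returns a workspace, then tries again. This
 * mirrors the prepare_to_wait()/schedule() pairing with a condition
 * variable; everything here is an illustrative stand-in, not kernel code.
 */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ws_freed = PTHREAD_COND_INITIALIZER;
static void *idle_ws;			/* NULL when no workspace is idle */

static void *get_workspace(void)
{
	void *ws;

	pthread_mutex_lock(&lock);
	for (;;) {
		if (idle_ws) {		/* reuse an idle workspace */
			ws = idle_ws;
			idle_ws = NULL;
			break;
		}
		ws = malloc(4096);	/* try a fresh allocation */
		if (ws)
			break;
		/* Allocation failed: sleep until put_workspace() signals. */
		pthread_cond_wait(&ws_freed, &lock);
	}
	pthread_mutex_unlock(&lock);
	return ws;
}

static void put_workspace(void *ws)
{
	pthread_mutex_lock(&lock);
	idle_ws = ws;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&ws_freed);
}

int main(void)
{
	put_workspace(get_workspace());
	return 0;
}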
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level. Here is where we continue to protect the
+ * max level workspace or update last_used accordingly. If the reclaim timer
+ * isn't set, it is also set here. Only putting back the max level workspace
+ * wakes up waiters.
+ */
+static void zstd_put_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_to_workspace(ws);
+
+ spin_lock(&wsm.lock);
+
+ /* A node is only taken off the lru if we are the corresponding level */
+ if (workspace->req_level == workspace->level) {
+ /* Hide a max level workspace from reclaim */
+ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ INIT_LIST_HEAD(&workspace->lru_list);
+ } else {
+ workspace->last_used = jiffies;
+ list_add(&workspace->lru_list, &wsm.lru_list);
+ if (!timer_pending(&wsm.timer))
+ mod_timer(&wsm.timer,
+ jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ }
+ }
+
+ set_bit(workspace->level - 1, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ workspace->req_level = 0;
+
+ spin_unlock(&wsm.lock);
+
+ if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ cond_wake_up(&wsm.wait);
+}
+
static void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(void)
+static struct list_head *zstd_alloc_workspace(unsigned int level)
{
- ZSTD_parameters params =
- zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = max_t(size_t,
- ZSTD_CStreamWorkspaceBound(params.cParams),
- ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+ workspace->size = zstd_ws_mem_sizes[level - 1];
+ workspace->level = level;
+ workspace->req_level = level;
+ workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
+ INIT_LIST_HEAD(&workspace->lru_list);
return &workspace->list;
fail:
@@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+ ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ len);
*out_pages = 0;
*total_out = 0;
@@ -419,11 +705,19 @@ finish:
return ret;
}
-static void zstd_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zstd_set_level(unsigned int level)
{
+ if (!level)
+ return ZSTD_BTRFS_DEFAULT_LEVEL;
+
+ return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
}
const struct btrfs_compress_op btrfs_zstd_compress = {
+ .init_workspace_manager = zstd_init_workspace_manager,
+ .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
+ .get_workspace = zstd_get_workspace,
+ .put_workspace = zstd_put_workspace,
.alloc_workspace = zstd_alloc_workspace,
.free_workspace = zstd_free_workspace,
.compress_pages = zstd_compress_pages,
diff --git a/fs/buffer.c b/fs/buffer.c
index 109f55196866..ce357602f471 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
struct buffer_head *head;
struct page *page;
int all_mapped = 1;
+ static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
@@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* file io on the block device and getblk. It gets dealt with
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
- if (all_mapped) {
- printk("__find_get_block_slow() failed. "
- "block=%llu, b_blocknr=%llu\n",
- (unsigned long long)block,
- (unsigned long long)bh->b_blocknr);
- printk("b_state=0x%08lx, b_size=%zu\n",
- bh->b_state, bh->b_size);
- printk("device %pg blocksize: %d\n", bdev,
- 1 << bd_inode->i_blkbits);
+ ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
+ if (all_mapped && __ratelimit(&last_warned)) {
+ printk("__find_get_block_slow() failed. block=%llu, "
+ "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
+ "device %pg blocksize: %d\n",
+ (unsigned long long)block,
+ (unsigned long long)bh->b_blocknr,
+ bh->b_state, bh->b_size, bdev,
+ 1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
@@ -562,7 +563,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
@@ -579,8 +580,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping,
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -1050,7 +1051,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
@@ -1073,9 +1074,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
@@ -2366,7 +2367,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
balance_dirty_pages_ratelimited(mapping);
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
err = -EINTR;
goto out;
}
@@ -3026,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
/* Uhhuh. We've got a bio that straddles the device size! */
truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
+ /*
+ * If the bio contains more than one segment spanning EOD, just return
+ * and let the IO layer turn it into an EIO.
+ */
+ if (truncated_bytes > bvec->bv_len)
+ return;
+
/* Truncate the bio.. */
bio->bi_iter.bi_size -= truncated_bytes;
bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
- zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ struct bio_vec bv;
+
+ mp_bvec_last_segment(bvec, &bv);
+ zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
}
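
A worked example of the new early-return check, with assumed numbers: suppose the device is 1000 sectors long, the bio starts at sector 998 with bi_size = 4096, and its last bvec holds only 1024 bytes.

	/* Hypothetical values, for illustration only. */
	truncated_bytes = 4096 - ((1000 - 998) << 9);	/* = 3072 */
	/* 3072 > bvec->bv_len (1024): more than the last segment hangs
	 * past EOD, so return and let the block layer fail the bio with
	 * -EIO instead of truncating it here. */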
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 95983c744164..1645fcfd9691 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -244,11 +244,13 @@ wait_for_old_object:
ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
- cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry);
+ cache->cache.ops->put_object(&xobject->fscache,
+ (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry);
goto try_again;
requeue:
- cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo);
+ cache->cache.ops->put_object(&xobject->fscache,
+ (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo);
_leave(" = -ETIMEDOUT");
return -ETIMEDOUT;
}
@@ -336,7 +338,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
try_again:
/* first step is to make up a grave dentry in the graveyard */
sprintf(nbuffer, "%08x%08x",
- (uint32_t) get_seconds(),
+ (uint32_t) ktime_get_real_seconds(),
(uint32_t) atomic_inc_return(&cache->gravecounter));
/* do the multiway lock magic */
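
The switch from get_seconds() to ktime_get_real_seconds() is a y2038 cleanup: the former returned unsigned long (32 bits on 32-bit architectures), the latter a time64_t. A one-line sketch of the replacement pattern:

	time64_t now = ktime_get_real_seconds();	/* 64-bit even on 32-bit arches */
	u32 stamp = (u32)now;	/* the "%08x" grave name keeps only the low bits anyway */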
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 40f7595aad10..8a577409d030 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -535,7 +535,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
+ put_page(backpage);
+ backpage = NULL;
put_page(netpage);
+ netpage = NULL;
fscache_retrieval_complete(op, 1);
continue;
}
@@ -608,7 +611,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
+ put_page(backpage);
+ backpage = NULL;
put_page(netpage);
+ netpage = NULL;
fscache_retrieval_complete(op, 1);
continue;
}
@@ -962,11 +968,8 @@ void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
__releases(&object->fscache.cookie->lock)
{
struct cachefiles_object *object;
- struct cachefiles_cache *cache;
object = container_of(_object, struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
_enter("%p,{%lu}", object, page->index);
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 0a29a00aed2e..511e6c68156a 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -135,7 +135,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(dentry);
+ if (!dentry)
+ return -ESTALE;
_enter("%p,#%d", object, auxdata->len);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 027408d55aee..5f0103f40079 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto out;
+ }
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out_free;
}
- if (ceph_snap(inode) != CEPH_NOSNAP) {
- ret = -EROFS;
- goto out_free;
- }
-
if (new_mode != old_mode) {
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
@@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
- pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
if (!pagelist)
goto out_err;
- ceph_pagelist_init(pagelist);
err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
if (err)
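
ceph_pagelist_alloc() folds the old kmalloc()+ceph_pagelist_init() pair into one refcounted allocation. A sketch of the resulting lifecycle, based only on the calls visible in this diff:

	struct ceph_pagelist *pl;

	pl = ceph_pagelist_alloc(GFP_KERNEL);	/* allocate + init, refcount 1 */
	if (!pl)
		return -ENOMEM;
	err = ceph_pagelist_reserve(pl, PAGE_SIZE);
	if (err) {
		ceph_pagelist_release(pl);	/* drop the last reference */
		return err;
	}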
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9c332a6f6667..a47c541f8006 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -306,7 +306,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct page *page = lru_to_page(page_list);
struct ceph_vino vino;
struct ceph_osd_request *req;
u64 off;
@@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
/* caller of readpages does not hold buffer and read caps
* (fadvise, madvise and readahead cases) */
int want = CEPH_CAP_FILE_CACHE;
- ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+ ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
dout("start_read %p, error getting cap\n", inode);
} else if (!(got & want)) {
@@ -333,8 +333,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
if (got)
ceph_put_cap_refs(ci, got);
while (!list_empty(page_list)) {
- page = list_entry(page_list->prev,
- struct page, lru);
+ page = lru_to_page(page_list);
list_del(&page->lru);
put_page(page);
}
@@ -1495,10 +1494,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
if (err < 0 || off >= i_size_read(inode)) {
unlock_page(page);
put_page(page);
- if (err == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ ret = vmf_error(err);
goto out_inline;
}
if (err < PAGE_SIZE)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index dd7dfdd2ba13..bba28a5034ba 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci)
+ struct ceph_inode_info *ci,
+ bool set_timeout)
{
- __cap_set_timeouts(mdsc, ci);
dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
@@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
+ if (set_timeout)
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -655,6 +657,9 @@ void ceph_add_cap(struct inode *inode,
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
+ if (cap->cap_gen < session->s_cap_gen)
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+
/*
* auth mds of the inode changed. we received the cap export
* message, but still haven't received the cap import message.
@@ -720,7 +725,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -1030,6 +1035,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci)
list_del_init(&ci->i_snap_realm_item);
ci->i_snap_realm_counter++;
ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
spin_unlock(&realm->inodes_with_caps_lock);
ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
realm);
@@ -1647,7 +1654,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
return dirty;
}
@@ -1853,14 +1860,17 @@ retry_locked:
retain |= CEPH_CAP_ANY; /* be greedy */
} else if (S_ISDIR(inode->i_mode) &&
(issued & CEPH_CAP_FILE_SHARED) &&
- __ceph_dir_is_complete(ci)) {
+ __ceph_dir_is_complete(ci)) {
/*
* If a directory is complete, we want to keep
* the exclusive cap. So that MDS does not end up
* revoking the shared cap on every create/unlink
* operation.
*/
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (IS_RDONLY(inode))
+ want = CEPH_CAP_ANY_SHARED;
+ else
+ want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
retain |= want;
} else {
@@ -1968,8 +1978,7 @@ retry_locked:
goto ack;
/* things we might delay */
- if ((cap->issued & ~retain) == 0 &&
- cap->mds_wanted == want)
+ if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
if (no_delay)
@@ -2065,7 +2074,7 @@ ack:
/* Reschedule delayed caps release if we delayed anything */
if (delayed)
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, false);
spin_unlock(&ci->i_ceph_lock);
@@ -2125,7 +2134,7 @@ retry:
if (delayed) {
spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
spin_unlock(&ci->i_ceph_lock);
}
} else {
@@ -2671,17 +2680,18 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
+ bool nonblock, int *got)
{
int ret, err = 0;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
- ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
+ ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
if (ret) {
if (err == -EAGAIN) {
ret = 0;
@@ -3045,7 +3055,8 @@ static void handle_cap_grant(struct inode *inode,
int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
- int check_caps = 0;
+ unsigned char check_caps = 0;
+ bool was_stale = cap->cap_gen < session->s_cap_gen;
bool wake = false;
bool writeback = false;
bool queue_trunc = false;
@@ -3060,21 +3071,6 @@ static void handle_cap_grant(struct inode *inode,
/*
- * auth mds of the inode changed. we received the cap export message,
- * but still haven't received the cap import message. handle_cap_export
- * updated the new auth MDS' cap.
- *
- * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
- * that was sent before the cap import message. So don't remove caps.
- */
- if (ceph_seq_cmp(seq, cap->seq) <= 0) {
- WARN_ON(cap != ci->i_auth_cap);
- WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
- seq = cap->seq;
- newcaps |= cap->issued;
- }
-
- /*
* If CACHE is being revoked, and we have no dirty buffers,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
@@ -3093,6 +3089,24 @@ static void handle_cap_grant(struct inode *inode,
}
}
+ if (was_stale)
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
+
/* side effects now are allowed */
cap->cap_gen = session->s_cap_gen;
cap->seq = seq;
@@ -3197,13 +3211,20 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(wanted),
ceph_cap_string(used),
ceph_cap_string(dirty));
- if (wanted != le32_to_cpu(grant->wanted)) {
- dout("mds wanted %s -> %s\n",
- ceph_cap_string(le32_to_cpu(grant->wanted)),
- ceph_cap_string(wanted));
- /* imported cap may not have correct mds_wanted */
- if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
- check_caps = 1;
+
+ if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
+ (wanted & ~(cap->mds_wanted | newcaps))) {
+ /*
+ * If mds is importing cap, prior cap messages that update
+ * 'wanted' may get dropped by mds (migrate seq mismatch).
+ *
+ * We don't send a cap message to update 'wanted' if what we
+ * want is already issued. If the mds revokes caps, the cap
+ * message that releases them also tells the mds what we want.
+ * But if caps were forcibly revoked by the mds (stale
+ * session), we may not have told the mds what we want.
+ */
+ check_caps = 1;
}
/* revocation, grant, or no-op? */
@@ -3536,9 +3557,9 @@ retry:
goto out_unlock;
if (target < 0) {
- __ceph_remove_cap(cap, false);
- if (!ci->i_auth_cap)
+ if (cap->mds_wanted | cap->issued)
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+ __ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3566,7 +3587,6 @@ retry:
tcap->cap_id = t_cap_id;
tcap->seq = t_seq - 1;
tcap->issue_seq = t_seq - 1;
- tcap->mseq = t_mseq;
tcap->issued |= issued;
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 92ab20433682..189df668b6a0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/striper.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -557,90 +558,26 @@ enum {
};
/*
- * Read a range of bytes striped over one or more objects. Iterate over
- * objects we stripe over. (That's not atomic, but good enough for now.)
- *
- * If we get a short result from the OSD, check against i_size; we need to
- * only return a short read to the caller if we hit EOF.
- */
-static int striped_read(struct inode *inode,
- u64 pos, u64 len,
- struct page **pages, int num_pages,
- int page_align, int *checkeof)
-{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 this_len;
- loff_t i_size;
- int page_idx;
- int ret, read = 0;
- bool hit_stripe, was_short;
-
- /*
- * we may need to do multiple reads. not atomic, unfortunately.
- */
-more:
- this_len = len;
- page_idx = (page_align + read) >> PAGE_SHIFT;
- ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
- &ci->i_layout, pos, &this_len,
- ci->i_truncate_seq, ci->i_truncate_size,
- pages + page_idx, num_pages - page_idx,
- ((page_align + read) & ~PAGE_MASK));
- if (ret == -ENOENT)
- ret = 0;
- hit_stripe = this_len < len;
- was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
- ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
- i_size = i_size_read(inode);
- if (ret >= 0) {
- if (was_short && (pos + ret < i_size)) {
- int zlen = min(this_len - ret, i_size - pos - ret);
- int zoff = page_align + read + ret;
- dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + zlen);
- ceph_zero_page_vector_range(zoff, zlen, pages);
- ret += zlen;
- }
-
- read += ret;
- pos += ret;
- len -= ret;
-
- /* hit stripe and need continue*/
- if (len && hit_stripe && pos < i_size)
- goto more;
- }
-
- if (read > 0) {
- ret = read;
- /* did we bounce off eof? */
- if (pos + len > i_size)
- *checkeof = CHECK_EOF;
- }
-
- dout("striped_read returns %d\n", ret);
- return ret;
-}
-
-/*
* Completely synchronous read and write methods. Direct from __user
* buffer to osd, or directly to user pages (if O_DIRECT).
*
- * If the read spans object boundary, just do multiple reads.
+ * If the read spans object boundary, just do multiple reads. (That's not
+ * atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
*/
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
- int *checkeof)
+ int *retry_op)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct page **pages;
- u64 off = iocb->ki_pos;
- int num_pages;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
- size_t len = iov_iter_count(to);
+ u64 off = iocb->ki_pos;
+ u64 len = iov_iter_count(to);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -653,61 +590,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off,
- off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
if (ret < 0)
return ret;
- if (unlikely(to->type & ITER_PIPE)) {
+ ret = 0;
+ while ((len = iov_iter_count(to)) > 0) {
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages;
size_t page_off;
- ret = iov_iter_get_pages_alloc(to, &pages, len,
- &page_off);
- if (ret <= 0)
- return -ENOMEM;
- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+ u64 i_size;
+ bool more;
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, off, &len, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
- ret = striped_read(inode, off, ret, pages, num_pages,
- page_off, checkeof);
- if (ret > 0) {
- iov_iter_advance(to, ret);
- off += ret;
+ more = len < iov_iter_count(to);
+
+ if (unlikely(iov_iter_is_pipe(to))) {
+ ret = iov_iter_get_pages_alloc(to, &pages, len,
+ &page_off);
+ if (ret <= 0) {
+ ceph_osdc_put_request(req);
+ ret = -ENOMEM;
+ break;
+ }
+ num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+ if (ret < len) {
+ len = ret;
+ osd_req_op_extent_update(req, 0, len);
+ more = false;
+ }
} else {
- iov_iter_advance(to, 0);
+ num_pages = calc_pages_for(off, len);
+ page_off = off & ~PAGE_MASK;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
+ }
}
- ceph_put_page_vector(pages, num_pages, false);
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- ret = striped_read(inode, off, len, pages, num_pages,
- (off & ~PAGE_MASK), checkeof);
- if (ret > 0) {
- int l, k = 0;
- size_t left = ret;
-
- while (left) {
- size_t page_off = off & ~PAGE_MASK;
- size_t copy = min_t(size_t, left,
- PAGE_SIZE - page_off);
- l = copy_page_to_iter(pages[k++], page_off,
- copy, to);
- off += l;
- left -= l;
- if (l < copy)
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+ false, false);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_put_request(req);
+
+ i_size = i_size_read(inode);
+ dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
+ off, len, ret, i_size, (more ? " MORE" : ""));
+
+ if (ret == -ENOENT)
+ ret = 0;
+ if (ret >= 0 && ret < len && (off + ret < i_size)) {
+ int zlen = min(len - ret, i_size - off - ret);
+ int zoff = page_off + ret;
+ dout("sync_read zero gap %llu~%llu\n",
+ off + ret, off + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
+ }
+
+ if (unlikely(iov_iter_is_pipe(to))) {
+ if (ret > 0) {
+ iov_iter_advance(to, ret);
+ off += ret;
+ } else {
+ iov_iter_advance(to, 0);
+ }
+ ceph_put_page_vector(pages, num_pages, false);
+ } else {
+ int idx = 0;
+ size_t left = ret > 0 ? ret : 0;
+ while (left > 0) {
+ size_t len, copied;
+ page_off = off & ~PAGE_MASK;
+ len = min_t(size_t, left, PAGE_SIZE - page_off);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, len, to);
+ off += copied;
+ left -= copied;
+ if (copied < len) {
+ ret = -EFAULT;
break;
+ }
}
+ ceph_release_page_vector(pages, num_pages);
}
- ceph_release_page_vector(pages, num_pages);
+
+ if (ret <= 0 || off >= i_size || !more)
+ break;
}
if (off > iocb->ki_pos) {
+ if (ret >= 0 &&
+ iov_iter_count(to) > 0 && off >= i_size_read(inode))
+ *retry_op = CHECK_EOF;
ret = off - iocb->ki_pos;
iocb->ki_pos = off;
}
- dout("sync_read result %zd\n", ret);
+ dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
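
A worked example of the zero-gap handling above, with assumed numbers: off = 0, len = 8192, the OSD returns ret = 4096, and i_size = 6144 (another object holds the rest of the file).

	/* zlen = min(len - ret, i_size - off - ret) */
	int zlen = min(8192 - 4096, 6144 - 0 - 4096);	/* = 2048 */
	/* Bytes [4096, 6144) of the page vector are zeroed and ret
	 * becomes 6144, so the caller sees a read that reaches EOF
	 * rather than a spurious short read in the middle of the file. */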
@@ -821,7 +815,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen;
}
- iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
+ iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
osd_data->num_bvecs,
osd_data->bvec_pos.iter.bi_size);
iov_iter_advance(&i, rc);
@@ -865,7 +859,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
}
spin_unlock(&ci->i_ceph_lock);
- req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
false, GFP_NOFS);
if (!req) {
ret = -ENOMEM;
@@ -877,6 +871,11 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+ req->r_ops[0] = orig_req->r_ops[0];
+
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
+
ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
if (ret) {
ceph_osdc_put_request(req);
@@ -884,11 +883,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
goto out;
}
- req->r_ops[0] = orig_req->r_ops[0];
-
- req->r_mtime = aio_req->mtime;
- req->r_data_offset = req->r_ops[0].extent.offset;
-
ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req;
@@ -1044,8 +1038,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int zlen = min_t(size_t, len - ret,
size - pos - ret);
- iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
- len);
+ iov_iter_bvec(&i, READ, bvecs, num_pages, len);
iov_iter_advance(&i, ret);
iov_iter_zero(zlen, &i);
ret += zlen;
@@ -1735,7 +1728,6 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
@@ -1743,10 +1735,7 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t endoff = 0;
loff_t size;
- if ((offset + length) > max(i_size_read(inode), fsc->max_file_size))
- return -EFBIG;
-
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!S_ISREG(inode->i_mode))
@@ -1763,18 +1752,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
- ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
- ret = -EDQUOT;
- goto unlock;
- }
-
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) &&
- !(mode & FALLOC_FL_PUNCH_HOLE)) {
- ret = -ENOSPC;
- goto unlock;
- }
-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
ret = ceph_uninline_data(file, NULL);
if (ret < 0)
@@ -1782,12 +1759,12 @@ static long ceph_fallocate(struct file *file, int mode,
}
size = i_size_read(inode);
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- endoff = offset + length;
- ret = inode_newsize_ok(inode, endoff);
- if (ret)
- goto unlock;
- }
+
+ /* Are we punching a hole beyond EOF? */
+ if (offset >= size)
+ goto unlock;
+ if ((offset + length) > size)
+ length = size - offset;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
@@ -1798,16 +1775,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (ret < 0)
goto unlock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- if (offset < size)
- ceph_zero_pagecache_range(inode, offset, length);
- ret = ceph_zero_objects(inode, offset, length);
- } else if (endoff > size) {
- truncate_pagecache_range(inode, size, -1);
- if (ceph_inode_set_size(inode, endoff))
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY, NULL);
- }
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
if (!ret) {
spin_lock(&ci->i_ceph_lock);
@@ -1817,9 +1786,6 @@ static long ceph_fallocate(struct file *file, int mode,
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
- if ((endoff > size) &&
- ceph_quota_is_max_bytes_approaching(inode, endoff))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
ceph_put_cap_refs(ci, got);
@@ -1829,6 +1795,307 @@ unlock:
return ret;
}
+/*
+ * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
+ * src_ci. Two attempts are made to obtain both caps, and an error is returned if
+ * this fails; zero is returned on success.
+ */
+static int get_rd_wr_caps(struct ceph_inode_info *src_ci,
+ loff_t src_endoff, int *src_got,
+ struct ceph_inode_info *dst_ci,
+ loff_t dst_endoff, int *dst_got)
+{
+ int ret = 0;
+ bool retrying = false;
+
+retry_caps:
+ ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+ dst_endoff, dst_got, NULL);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Since we're already holding the FILE_WR capability for the dst file,
+ * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
+ * retry dance instead to try to get both capabilities.
+ */
+ ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
+ false, src_got);
+ if (ret <= 0) {
+ /* Start by dropping dst_ci caps and getting src_ci caps */
+ ceph_put_cap_refs(dst_ci, *dst_got);
+ if (retrying) {
+ if (!ret)
+ /* ceph_try_get_caps masks EAGAIN */
+ ret = -EAGAIN;
+ return ret;
+ }
+ ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD,
+ CEPH_CAP_FILE_SHARED, src_endoff,
+ src_got, NULL);
+ if (ret < 0)
+ return ret;
+ /* ... drop src_ci caps too, and retry */
+ ceph_put_cap_refs(src_ci, *src_got);
+ retrying = true;
+ goto retry_caps;
+ }
+ return ret;
+}
+
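
The retry dance in get_rd_wr_caps() is the classic two-resource pattern for when no global acquisition order exists: block on the first, trylock the second, and on failure drop everything, wait out the second's holder, and retry once. A generic sketch of the same shape (hypothetical mutexes, not ceph code):

static int lock_both(struct mutex *a, struct mutex *b)
{
	bool retried = false;
retry:
	mutex_lock(a);
	if (mutex_trylock(b))
		return 0;		/* got both, no deadlock possible */
	mutex_unlock(a);
	if (retried)
		return -EAGAIN;		/* mirrors the second-failure bail-out */
	mutex_lock(b);			/* wait out whoever holds b... */
	mutex_unlock(b);		/* ...then drop it and retry from the top */
	retried = true;
	goto retry;
}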
+static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
+ struct ceph_inode_info *dst_ci, int dst_got)
+{
+ ceph_put_cap_refs(src_ci, src_got);
+ ceph_put_cap_refs(dst_ci, dst_got);
+}
+
+/*
+ * This function does several size-related checks, returning an error if:
+ * - source file is smaller than off+len
+ * - destination file size is not OK (inode_newsize_ok())
+ * - max bytes quota is exceeded
+ */
+static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
+ loff_t src_off, loff_t dst_off, size_t len)
+{
+ loff_t size, endoff;
+
+ size = i_size_read(src_inode);
+ /*
+ * Don't copy beyond source file EOF. Instead of simply setting length
+ * to (size - src_off), just fall back to the VFS default implementation, as the
+ * local i_size may be stale due to other clients writing to the source
+ * inode.
+ */
+ if (src_off + len > size) {
+ dout("Copy beyond EOF (%llu + %zu > %llu)\n",
+ src_off, len, size);
+ return -EOPNOTSUPP;
+ }
+ size = i_size_read(dst_inode);
+
+ endoff = dst_off + len;
+ if (inode_newsize_ok(dst_inode, endoff))
+ return -EOPNOTSUPP;
+
+ if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
+ return -EDQUOT;
+
+ return 0;
+}
+
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ struct ceph_inode_info *src_ci = ceph_inode(src_inode);
+ struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
+ struct ceph_cap_flush *prealloc_cf;
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ loff_t endoff = 0, size;
+ ssize_t ret = -EIO;
+ u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
+ u32 src_objlen, dst_objlen, object_size;
+ int src_got = 0, dst_got = 0, err, dirty;
+ bool do_final_copy = false;
+
+ if (src_inode == dst_inode)
+ return -EINVAL;
+ if (ceph_snap(dst_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ /*
+ * Some of the checks below will return -EOPNOTSUPP, which will force a
+ * fallback to the default VFS copy_file_range implementation. This is
+ * desirable in several cases (for example, when 'len' is smaller than
+ * the object size, or where the fallback would be more efficient).
+ */
+
+ if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM))
+ return -EOPNOTSUPP;
+
+ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
+ (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size))
+ return -EOPNOTSUPP;
+
+ if (len < src_ci->i_layout.object_size)
+ return -EOPNOTSUPP; /* no remote copy will be done */
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ /* Start by sync'ing the source and destination files */
+ ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
+ if (ret < 0) {
+ dout("failed to write src file (%zd)\n", ret);
+ goto out;
+ }
+ ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
+ if (ret < 0) {
+ dout("failed to write dst file (%zd)\n", ret);
+ goto out;
+ }
+
+ /*
+ * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
+ * clients may have dirty data in their caches. And OSDs know nothing
+ * about caps, so they can't safely do the remote object copies.
+ */
+ err = get_rd_wr_caps(src_ci, (src_off + len), &src_got,
+ dst_ci, (dst_off + len), &dst_got);
+ if (err < 0) {
+ dout("get_rd_wr_caps returned %d\n", err);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
+ if (ret < 0)
+ goto out_caps;
+
+ size = i_size_read(dst_inode);
+ endoff = dst_off + len;
+
+ /* Drop dst file cached pages */
+ ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+ dst_off >> PAGE_SHIFT,
+ endoff >> PAGE_SHIFT);
+ if (ret < 0) {
+ dout("Failed to invalidate inode pages (%zd)\n", ret);
+ ret = 0; /* XXX */
+ }
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ src_ci->i_layout.object_size,
+ &src_objnum, &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ dst_ci->i_layout.object_size,
+ &dst_objnum, &dst_objoff, &dst_objlen);
+ /* object-level offsets need to be the same */
+ if (src_objoff != dst_objoff) {
+ ret = -EOPNOTSUPP;
+ goto out_caps;
+ }
+
+ /*
+ * Do a manual copy if the object offset isn't object aligned.
+ * 'src_objlen' contains the bytes left until the end of the object,
+ * starting at src_off.
+ */
+ if (src_objoff) {
+ /*
+ * we need to temporarily drop all caps as we'll be calling
+ * {read,write}_iter, which will get caps again.
+ */
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+ ret = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, src_objlen, flags);
+ if (ret < 0) {
+ dout("do_splice_direct returned %d\n", err);
+ goto out;
+ }
+ len -= ret;
+ err = get_rd_wr_caps(src_ci, (src_off + len),
+ &src_got, dst_ci,
+ (dst_off + len), &dst_got);
+ if (err < 0)
+ goto out;
+ err = is_file_size_ok(src_inode, dst_inode,
+ src_off, dst_off, len);
+ if (err < 0)
+ goto out_caps;
+ }
+ object_size = src_ci->i_layout.object_size;
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ err = ceph_osdc_copy_from(
+ &ceph_inode_to_client(src_inode)->client->osdc,
+ src_ci->i_vino.snap, 0,
+ &src_oid, &src_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+ &dst_oid, &dst_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+ if (err) {
+ dout("ceph_osdc_copy_from returned %d\n", err);
+ if (!ret)
+ ret = err;
+ goto out_caps;
+ }
+ len -= object_size;
+ src_off += object_size;
+ dst_off += object_size;
+ ret += object_size;
+ }
+
+ if (len)
+ /* We still need one final local copy */
+ do_final_copy = true;
+
+ file_update_time(dst_file);
+ if (endoff > size) {
+ int caps_flags = 0;
+
+ /* Let the MDS know about dst file size change */
+ if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
+ caps_flags |= CHECK_CAPS_NODELAY;
+ if (ceph_inode_set_size(dst_inode, endoff))
+ caps_flags |= CHECK_CAPS_AUTHONLY;
+ if (caps_flags)
+ ceph_check_caps(dst_ci, caps_flags, NULL);
+ }
+ /* Mark Fw dirty */
+ spin_lock(&dst_ci->i_ceph_lock);
+ dst_ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&dst_ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(dst_inode, dirty);
+
+out_caps:
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+
+ if (do_final_copy) {
+ err = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (err < 0) {
+ dout("do_splice_direct returned %d\n", err);
+ goto out;
+ }
+ len -= err;
+ ret += err;
+ }
+
+out:
+ ceph_free_cap_flush(prealloc_cf);
+
+ return ret;
+}
+
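
From userspace this path is exercised through copy_file_range(2); when the checks above return -EOPNOTSUPP, the VFS transparently falls back to its generic implementation. A minimal caller, with hypothetical paths (both files must live on the same CephFS mount for the remote object copy to be attempted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int in = open("/mnt/cephfs/src", O_RDONLY);
	int out = open("/mnt/cephfs/dst", O_WRONLY | O_CREAT, 0644);
	ssize_t n;

	if (in < 0 || out < 0)
		return 1;
	/* NULL offsets: copy from, and advance, the current file offsets. */
	n = copy_file_range(in, NULL, out, NULL, 1 << 26, 0);
	printf("copied %zd bytes\n", n);
	return n < 0;
}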
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
@@ -1844,5 +2111,5 @@ const struct file_operations ceph_file_fops = {
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
.fallocate = ceph_fallocate,
+ .copy_file_range = ceph_copy_file_range,
};
-
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ebc7bdaed2d0..9d1f34d46627 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1098,8 +1098,9 @@ out_unlock:
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*/
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
+static int splice_dentry(struct dentry **pdn, struct inode *in)
{
+ struct dentry *dn = *pdn;
struct dentry *realdn;
BUG_ON(d_inode(dn));
@@ -1132,24 +1133,23 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
- dput(dn);
- dn = realdn; /* note realdn contains the error */
- goto out;
- } else if (realdn) {
+ return PTR_ERR(realdn);
+ }
+
+ if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
dn, d_count(dn),
realdn, d_count(realdn),
d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
- dn = realdn;
+ *pdn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
dout("dn %p attached to %p ino %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)));
}
-out:
- return dn;
+ return 0;
}
/*
@@ -1196,7 +1196,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct qstr dname;
struct dentry *dn, *parent;
@@ -1334,7 +1336,12 @@ retry_lookup:
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
- dn = req->r_old_dentry; /* use old_dentry */
+ /* swap r_dentry and r_old_dentry in case
+ * splice_dentry() gets called later. This is safe
+ * because no other place uses them. */
+ req->r_dentry = req->r_old_dentry;
+ req->r_old_dentry = dn;
+ dn = req->r_dentry;
}
/* null dentry? */
@@ -1359,12 +1366,10 @@ retry_lookup:
if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ err = splice_dentry(&req->r_dentry, in);
+ if (err < 0)
goto done;
- }
- req->r_dentry = dn; /* may have spliced */
+ dn = req->r_dentry; /* may have spliced */
} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)),
@@ -1384,22 +1389,18 @@ retry_lookup:
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
- struct dentry *dn = req->r_dentry;
struct inode *dir = req->r_parent;
/* fill out a snapdir LOOKUPSNAP dentry */
- BUG_ON(!dn);
BUG_ON(!dir);
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
- dout(" linking snapped dir %p to dn %p\n", in, dn);
+ BUG_ON(!req->r_dentry);
+ dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ err = splice_dentry(&req->r_dentry, in);
+ if (err < 0)
goto done;
- }
- req->r_dentry = dn; /* may have spliced */
} else if (rinfo->head->is_dentry) {
struct ceph_vino *ptvino = NULL;
@@ -1663,8 +1664,6 @@ retry_lookup:
}
if (d_really_is_negative(dn)) {
- struct dentry *realdn;
-
if (ceph_security_xattr_deadlock(in)) {
dout(" skip splicing dn %p to inode %p"
" (security xattr deadlock)\n", dn, in);
@@ -1673,14 +1672,9 @@ retry_lookup:
goto next_item;
}
- realdn = splice_dentry(dn, in);
- if (IS_ERR(realdn)) {
- err = PTR_ERR(realdn);
- d_drop(dn);
- dn = NULL;
+ err = splice_dentry(&dn, in);
+ if (err < 0)
goto next_item;
- }
- dn = realdn;
}
ceph_dentry(dn)->offset = rde->offset;
@@ -1696,8 +1690,7 @@ retry_lookup:
err = ret;
}
next_item:
- if (dn)
- dput(dn);
+ dput(dn);
}
out:
if (err == 0 && skipped == 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bc43c822426a..163fc74bf221 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -80,12 +80,8 @@ static int parse_reply_info_in(void **p, void *end,
info->symlink = *p;
*p += info->symlink_len;
- if (features & CEPH_FEATURE_DIRLAYOUTHASH)
- ceph_decode_copy_safe(p, end, &info->dir_layout,
- sizeof(info->dir_layout), bad);
- else
- memset(&info->dir_layout, 0, sizeof(info->dir_layout));
-
+ ceph_decode_copy_safe(p, end, &info->dir_layout,
+ sizeof(info->dir_layout), bad);
ceph_decode_32_safe(p, end, info->xattr_len, bad);
ceph_decode_need(p, end, info->xattr_len, bad);
info->xattr_data = *p;
@@ -1236,13 +1232,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
+ if (cap->mds_wanted | cap->issued)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
struct ceph_mds_client *mdsc = fsc->mdsc;
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
-
if (ci->i_wrbuffer_ref > 0 &&
READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
@@ -1359,6 +1355,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
dispose_cap_releases(session->s_mdsc, &dispose);
}
+enum {
+ RECONNECT,
+ RENEWCAPS,
+ FORCE_RO,
+};
+
/*
* wake up any threads waiting on this session's caps. if the cap is
* old (didn't get renewed on the client reconnect), remove it now.
@@ -1369,23 +1371,34 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned long ev = (unsigned long)arg;
- if (arg) {
+ if (ev == RECONNECT) {
spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
+ } else if (ev == RENEWCAPS) {
+ if (cap->cap_gen < cap->session->s_cap_gen) {
+ /* mds did not re-issue stale cap */
+ spin_lock(&ci->i_ceph_lock);
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+ /* make sure mds knows what we want */
+ if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ } else if (ev == FORCE_RO) {
}
wake_up_all(&ci->i_cap_wq);
return 0;
}
-static void wake_up_session_caps(struct ceph_mds_session *session,
- int reconnect)
+static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
{
dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
iterate_session_caps(session, wake_up_session_cb,
- (void *)(unsigned long)reconnect);
+ (void *)(unsigned long)ev);
}
/*
@@ -1470,7 +1483,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_cap_lock);
if (wake)
- wake_up_session_caps(session, 0);
+ wake_up_session_caps(session, RENEWCAPS);
}
/*
@@ -2071,7 +2084,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+ msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
@@ -2136,7 +2149,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
- refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
@@ -2852,7 +2864,7 @@ static void handle_session(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
session->s_readonly = true;
spin_unlock(&session->s_cap_lock);
- wake_up_session_caps(session, 0);
+ wake_up_session_caps(session, FORCE_RO);
break;
case CEPH_SESSION_REJECT:
@@ -2948,11 +2960,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
- char *path;
- int pathlen, err;
- u64 pathbase;
+ int err;
u64 snap_follows;
- struct dentry *dentry;
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
@@ -2961,19 +2970,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (err)
return err;
- dentry = d_find_alias(inode);
- if (dentry) {
- path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out_dput;
- }
- } else {
- path = NULL;
- pathlen = 0;
- pathbase = 0;
- }
-
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -2985,7 +2981,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = 0;
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -2996,7 +2992,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = 0;
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -3028,7 +3024,7 @@ encode_again:
GFP_NOFS);
if (!flocks) {
err = -ENOMEM;
- goto out_free;
+ goto out_err;
}
err = ceph_encode_locks_to_buffer(inode, flocks,
num_fcntl_locks,
@@ -3038,7 +3034,7 @@ encode_again:
flocks = NULL;
if (err == -ENOSPC)
goto encode_again;
- goto out_free;
+ goto out_err;
}
} else {
kfree(flocks);
@@ -3058,44 +3054,64 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(rec.v2);
- struct_len += sizeof(u32) + pathlen;
+ struct_len += sizeof(u32) + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
total_len += struct_len;
err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err) {
+ kfree(flocks);
+ goto out_err;
+ }
- if (!err) {
- if (recon_state->msg_version >= 3) {
- ceph_pagelist_encode_8(pagelist, struct_v);
- ceph_pagelist_encode_8(pagelist, 1);
- ceph_pagelist_encode_32(pagelist, struct_len);
- }
- ceph_pagelist_encode_string(pagelist, path, pathlen);
- ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
- ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
- if (struct_v >= 2)
- ceph_pagelist_encode_64(pagelist, snap_follows);
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
}
+ ceph_pagelist_encode_string(pagelist, NULL, 0);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks, num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+
kfree(flocks);
} else {
- size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
- err = ceph_pagelist_reserve(pagelist, size);
- if (!err) {
- ceph_pagelist_encode_string(pagelist, path, pathlen);
- ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ u64 pathbase = 0;
+ int pathlen = 0;
+ char *path = NULL;
+ struct dentry *dentry;
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ path = ceph_mdsc_build_path(dentry,
+ &pathlen, &pathbase, 0);
+ dput(dentry);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_err;
+ }
+ rec.v1.pathbase = cpu_to_le64(pathbase);
}
+
+ err = ceph_pagelist_reserve(pagelist,
+ pathlen + sizeof(u32) + sizeof(rec.v1));
+ if (err) {
+ kfree(path);
+ goto out_err;
+ }
+
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+
+ kfree(path);
}
recon_state->nr_caps++;
-out_free:
- kfree(path);
-out_dput:
- dput(dentry);
+out_err:
return err;
}
@@ -3126,12 +3142,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
pr_info("mds%d reconnect start\n", mds);
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
goto fail_nopagelist;
- ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
@@ -3184,10 +3199,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.pagelist = pagelist;
if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
recon_state.msg_version = 3;
- else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
- recon_state.msg_version = 2;
else
- recon_state.msg_version = 1;
+ recon_state.msg_version = 2;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
@@ -3241,6 +3254,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
+ ceph_pagelist_release(pagelist);
return;
fail:
@@ -3346,7 +3360,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
pr_info("mds%d recovery completed\n", s->s_mds);
kick_requests(mdsc, i);
ceph_kick_flushing_caps(mdsc, s);
- wake_up_session_caps(s, 1);
+ wake_up_session_caps(s, RECONNECT);
}
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 32fcce0d4d3c..729da155ebf0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -17,14 +17,16 @@
#include <linux/ceph/auth.h>
/* The first 8 bits are reserved for old ceph releases */
-#define CEPHFS_FEATURE_MIMIC 8
-
-#define CEPHFS_FEATURES_ALL { \
- 0, 1, 2, 3, 4, 5, 6, 7, \
- CEPHFS_FEATURE_MIMIC, \
+#define CEPHFS_FEATURE_MIMIC 8
+#define CEPHFS_FEATURE_REPLY_ENCODING 9
+#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
+#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_LAZY_CAP_WANTED, \
}
-
-#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 44e53abeb32a..1a2c5d390f7f 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -35,7 +35,6 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
/* pick */
n = prandom_u32() % n;
- i = 0;
for (i = 0; n > 0; i++, n--)
while (m->m_info[i].state <= 0)
i++;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 32d4f13784ba..9455d3aef0c3 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -3,19 +3,6 @@
* quota.c - CephFS quota
*
* Copyright (C) 2017-2018 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/statfs.h>
@@ -237,7 +224,8 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
ceph_put_snap_realm(mdsc, realm);
realm = next;
}
- ceph_put_snap_realm(mdsc, realm);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
up_read(&mdsc->snap_rwsem);
return exceeded;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 041c27ea8de1..f74193da0e09 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ if (list_empty(&ci->i_snap_flush_item))
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index eab1359d0553..da2cd8e89062 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -165,6 +165,8 @@ enum {
Opt_noacl,
Opt_quotadf,
Opt_noquotadf,
+ Opt_copyfrom,
+ Opt_nocopyfrom,
};
static match_table_t fsopt_tokens = {
@@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = {
{Opt_noacl, "noacl"},
{Opt_quotadf, "quotadf"},
{Opt_noquotadf, "noquotadf"},
+ {Opt_copyfrom, "copyfrom"},
+ {Opt_nocopyfrom, "nocopyfrom"},
{-1, NULL}
};
@@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noquotadf:
fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
break;
+ case Opt_copyfrom:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
+ case Opt_nocopyfrom:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= SB_POSIXACL;
@@ -520,7 +530,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_putc(m, ',');
pos = m->count;
- ret = ceph_print_client_options(m, fsc->client);
+ ret = ceph_print_client_options(m, fsc->client, false);
if (ret)
return ret;
@@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
+ if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
+ seq_puts(m, ",copyfrom");
+
if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
@@ -627,7 +640,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
opt = NULL; /* fsc->client now owns this */
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->osdc.abort_on_full = true;
+ ceph_set_opt(fsc->client, ABORT_ON_FULL);
if (!fsopt->mds_namespace) {
ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 582e28fd1b7b..dfb64a5211b6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -40,8 +40,11 @@
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
+#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
-#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
+#define CEPH_MOUNT_OPT_DEFAULT \
+ (CEPH_MOUNT_OPT_DCACHE | \
+ CEPH_MOUNT_OPT_NOCOPYFROM)
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -807,7 +810,7 @@ static inline int default_congestion_kb(void)
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
@@ -1008,7 +1011,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
extern int ceph_try_get_caps(struct ceph_inode_info *ci,
- int need, int want, int *got);
+ int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
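
The new default includes NOCOPYFROM, so the remote-copy path stays off unless a mount explicitly opts in. A hedged sketch of opting in via mount(2); the monitor address, credentials, and paths below are placeholders, not values from this patch:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "copyfrom" clears CEPH_MOUNT_OPT_NOCOPYFROM from the default. */
	if (mount("192.168.0.1:6789:/", "/mnt/cephfs", "ceph", 0,
		  "name=admin,secret=<base64-key>,copyfrom") < 0)
		perror("mount");
	return 0;
}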
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5cc8b94f8206..316f6ad10644 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (size > 0) {
/* copy value into pagelist */
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
err = ceph_pagelist_append(pagelist, value, size);
if (err)
goto out;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index abcd78e332fe..f1ddc9d03c10 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -133,7 +133,7 @@ config CIFS_XATTR
config CIFS_POSIX
bool "CIFS POSIX Extensions"
- depends on CIFS_XATTR
+ depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
help
Enabling this option will cause the cifs client to attempt to
negotiate a newer dialect with servers, such as Samba 3.0.5
@@ -190,8 +190,9 @@ config CIFS_DFS_UPCALL
moves to a different server. This feature also enables
an upcall mechanism for CIFS which contacts userspace helper
utilities to provide server name resolution (host names to
- IP addresses) which is needed for implicit mounts of DFS junction
- points. If unsure, say Y.
+ IP addresses) which is needed in order to reconnect to
+ servers if their addresses change or for implicit mounts of
+ DFS junction points. If unsure, say Y.
config CIFS_NFSD_EXPORT
bool "Allow nfsd to export CIFS file system"
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 85817991ee68..51af69a1a328 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -17,7 +17,7 @@ cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o
cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f1fbea947fef..e92a2fee3c57 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -30,6 +30,9 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
#endif
@@ -132,7 +135,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
- seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed);
+ seq_printf(m, "\tSpeed: %zu bps\n", iface->speed);
seq_puts(m, "\t\tCapabilities: ");
if (iface->rdma_capable)
seq_puts(m, "rdma ");
@@ -145,6 +148,58 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr);
}
+static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
+{
+ struct list_head *stmp, *tmp, *tmp1, *tmp2;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
+
+ seq_puts(m, "# Version:1\n");
+ seq_puts(m, "# Format:\n");
+ seq_puts(m, "# <tree id> <persistent fid> <flags> <count> <pid> <uid>");
+#ifdef CONFIG_CIFS_DEBUG2
+ seq_printf(m, " <filename> <mid>\n");
+#else
+ seq_printf(m, " <filename>\n");
+#endif /* CIFS_DEBUG2 */
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(stmp, &cifs_tcp_ses_list) {
+ server = list_entry(stmp, struct TCP_Server_Info,
+ tcp_ses_list);
+ list_for_each(tmp, &server->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each(tmp1, &ses->tcon_list) {
+ tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ spin_lock(&tcon->open_file_lock);
+ list_for_each(tmp2, &tcon->openFileList) {
+ cfile = list_entry(tmp2, struct cifsFileInfo,
+ tlist);
+ seq_printf(m,
+ "0x%x 0x%llx 0x%x %d %d %d %s",
+ tcon->tid,
+ cfile->fid.persistent_fid,
+ cfile->f_flags,
+ cfile->count,
+ cfile->pid,
+ from_kuid(&init_user_ns, cfile->uid),
+ cfile->dentry->d_name.name);
+#ifdef CONFIG_CIFS_DEBUG2
+ seq_printf(m, " 0x%llx\n", cfile->fid.mid);
+#else
+ seq_printf(m, "\n");
+#endif /* CIFS_DEBUG2 */
+ }
+ spin_unlock(&tcon->open_file_lock);
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ seq_putc(m, '\n');
+ return 0;
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct list_head *tmp1, *tmp2, *tmp3;
@@ -197,6 +252,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, ",ACL");
#endif
seq_putc(m, '\n');
+ seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize);
seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
seq_printf(m, "Servers:");
@@ -285,7 +341,7 @@ skip_rdma:
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
(ses->serverNOS == NULL)) {
- seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t",
+ seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
i, ses->serverName, ses->ses_count,
ses->capabilities, ses->status);
if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST)
@@ -296,16 +352,18 @@ skip_rdma:
seq_printf(m,
"\n%d) Name: %s Domain: %s Uses: %d OS:"
" %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
- " session status: %d\t",
+ " session status: %d ",
i, ses->serverName, ses->serverDomain,
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
}
if (server->rdma)
seq_printf(m, "RDMA\n\t");
- seq_printf(m, "TCP status: %d\n\tLocal Users To "
+ seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To "
"Server: %d SecMode: 0x%x Req On Wire: %d",
- server->tcpStatus, server->srv_count,
+ server->tcpStatus,
+ server->reconnect_instance,
+ server->srv_count,
server->sec_mode, in_flight(server));
#ifdef CONFIG_CIFS_STATS2
@@ -352,7 +410,7 @@ skip_rdma:
seq_printf(m, "\n\tServer interfaces: %zu\n",
ses->iface_count);
for (j = 0; j < ses->iface_count; j++) {
- seq_printf(m, "\t%d)\n", j);
+ seq_printf(m, "\t%d)", j);
cifs_dump_iface(m, &ses->iface_list[j]);
}
spin_unlock(&ses->iface_lock);
@@ -383,6 +441,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
#endif /* CONFIG_CIFS_STATS2 */
+ atomic_set(&tcpSesReconnectCount, 0);
+ atomic_set(&tconInfoReconnectCount, 0);
+
spin_lock(&GlobalMid_Lock);
GlobalMaxActiveXid = 0;
GlobalCurrentXid = 0;
@@ -560,6 +621,9 @@ cifs_proc_init(void)
proc_create_single("DebugData", 0, proc_fs_cifs,
cifs_debug_data_proc_show);
+ proc_create_single("open_files", 0400, proc_fs_cifs,
+ cifs_debug_files_proc_show);
+
proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops);
proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops);
proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops);
@@ -569,6 +633,11 @@ cifs_proc_init(void)
&cifs_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
&cifs_lookup_cache_proc_fops);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops);
+#endif
+
#ifdef CONFIG_CIFS_SMB_DIRECT
proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs,
&cifs_rdma_readwrite_threshold_proc_fops);
@@ -596,12 +665,17 @@ cifs_proc_clean(void)
return;
remove_proc_entry("DebugData", proc_fs_cifs);
+ remove_proc_entry("open_files", proc_fs_cifs);
remove_proc_entry("cifsFYI", proc_fs_cifs);
remove_proc_entry("traceSMB", proc_fs_cifs);
remove_proc_entry("Stats", proc_fs_cifs);
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ remove_proc_entry("dfscache", proc_fs_cifs);
+#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs);
remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs);
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index f4f3f0853c6e..631dc1bb21c1 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -47,6 +47,29 @@ extern int cifsFYI;
*/
#ifdef CONFIG_CIFS_DEBUG
+
+/*
+ * When adding tracepoints and debug messages we have various choices.
+ * Some considerations:
+ *
+ * Use cifs_dbg(VFS, ...) for things we always want logged and the user to see
+ * cifs_info(...) for slightly less important events; admins can filter via loglevel > 6
+ * cifs_dbg(FYI, ...) minor debugging messages, off by default
+ * trace_smb3_* ftrace functions are preferred for complex debug messages
+ * intended for developers or experienced admins, off by default
+ */
+
+/* Information level messages, minor events */
+#define cifs_info_func(ratefunc, fmt, ...) \
+do { \
+ pr_info_ ## ratefunc("CIFS: " fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define cifs_info(fmt, ...) \
+do { \
+ cifs_info_func(ratelimited, fmt, ##__VA_ARGS__); \
+} while (0)
+
/* information message: e.g., configuration, major event */
#define cifs_dbg_func(ratefunc, type, fmt, ...) \
do { \
@@ -81,6 +104,11 @@ do { \
if (0) \
pr_debug(fmt, ##__VA_ARGS__); \
} while (0)
+
+#define cifs_info(fmt, ...) \
+do { \
+ pr_info("CIFS: "fmt, ##__VA_ARGS__); \
+} while (0)
#endif
#endif /* _H_CIFS_DEBUG */
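[Illustration, not part of the patch: the comment block added above documents which logging macro to reach for. A sketch of hypothetical call sites applying that guidance, using the macros as defined in this header:]

static void example_log_levels(int rc)
{
	/* user-visible failure: always logged */
	cifs_dbg(VFS, "mount failed, rc = %d\n", rc);

	/* minor event: rate-limited info, filterable via loglevel > 6 */
	cifs_info("negotiated a newer dialect\n");

	/* developer chatter: silent unless cifsFYI is enabled */
	cifs_dbg(FYI, "entering %s\n", __func__);
}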
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 6b61df117fd4..d9b99abe1243 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -25,6 +25,7 @@
#include "dns_resolve.h"
#include "cifs_debug.h"
#include "cifs_unicode.h"
+#include "dfs_cache.h"
static LIST_HEAD(cifs_dfs_automount_list);
@@ -126,7 +127,7 @@ cifs_build_devname(char *nodename, const char *prepath)
* @sb_mountdata: parent/root DFS mount options (template)
* @fullpath: full path in UNC format
* @ref: server's referral
- * @devname: pointer for saving device name
+ * @devname: optional pointer for saving device name
*
* creates mount options for submount based on template options sb_mountdata
* and replacing unc,ip,prefixpath options with ones we've got from ref_unc.
@@ -140,6 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
char **devname)
{
int rc;
+ char *name;
char *mountdata = NULL;
const char *prepath = NULL;
int md_len;
@@ -158,17 +160,17 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
prepath++;
}
- *devname = cifs_build_devname(ref->node_name, prepath);
- if (IS_ERR(*devname)) {
- rc = PTR_ERR(*devname);
- *devname = NULL;
+ name = cifs_build_devname(ref->node_name, prepath);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ name = NULL;
goto compose_mount_options_err;
}
- rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
+ rc = dns_resolve_server_name_to_ip(name, &srvIP);
if (rc < 0) {
cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
- __func__, *devname, rc);
+ __func__, name, rc);
goto compose_mount_options_err;
}
@@ -224,6 +226,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
strcat(mountdata, "ip=");
strcat(mountdata, srvIP);
+ if (devname)
+ *devname = name;
+
/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
@@ -234,8 +239,7 @@ compose_mount_options_out:
compose_mount_options_err:
kfree(mountdata);
mountdata = ERR_PTR(rc);
- kfree(*devname);
- *devname = NULL;
+ kfree(name);
goto compose_mount_options_out;
}
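[Illustration, not part of the patch: roughly what cifs_compose_mount_options() produces, with made-up values. Given a parent template and a referral such as

	sb_mountdata:   "vers=3.0,user=bob,unc=\\oldserver\share"
	ref->node_name: "\\newserver\share2"

the unc=, ip= and prefixpath= options of the template are dropped and the resolved address is appended, giving composed options along the lines of

	mountdata:      "vers=3.0,user=bob,ip=192.0.2.7"

while the rebuilt device name ("\\newserver\share2") is handed back through *devname only when the caller passes a non-NULL pointer, per the change above.]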
@@ -251,20 +255,30 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
{
struct vfsmount *mnt;
char *mountdata;
- char *devname = NULL;
+ char *devname;
+
+ /*
+ * Always pass down the DFS full path to smb3_do_mount() so we
+ * can use it later for failover.
+ */
+ devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ convert_delimiter(devname, '/');
/* strip first '\' from fullpath */
mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
- fullpath + 1, ref, &devname);
-
- if (IS_ERR(mountdata))
+ fullpath + 1, ref, NULL);
+ if (IS_ERR(mountdata)) {
+ kfree(devname);
return (struct vfsmount *)mountdata;
+ }
mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
kfree(mountdata);
kfree(devname);
return mnt;
-
}
static void dump_referral(const struct dfs_info3_param *ref)
@@ -282,16 +296,15 @@ static void dump_referral(const struct dfs_info3_param *ref)
*/
static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
{
- struct dfs_info3_param *referrals = NULL;
- unsigned int num_referrals = 0;
+ struct dfs_info3_param referral = {0};
struct cifs_sb_info *cifs_sb;
struct cifs_ses *ses;
- char *full_path;
+ struct cifs_tcon *tcon;
+ char *full_path, *root_path;
unsigned int xid;
- int i;
+ int len;
int rc;
struct vfsmount *mnt;
- struct tcon_link *tlink;
cifs_dbg(FYI, "in %s\n", __func__);
BUG_ON(IS_ROOT(mntpt));
@@ -304,54 +317,80 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
*/
mnt = ERR_PTR(-ENOMEM);
+ cifs_sb = CIFS_SB(mntpt->d_sb);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) {
+ mnt = ERR_PTR(-EREMOTE);
+ goto cdda_exit;
+ }
+
/* always use tree name prefix */
full_path = build_path_from_dentry_optional_prefix(mntpt, true);
if (full_path == NULL)
goto cdda_exit;
- cifs_sb = CIFS_SB(mntpt->d_sb);
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- mnt = ERR_CAST(tlink);
+ cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
+
+ if (!cifs_sb_master_tlink(cifs_sb)) {
+ cifs_dbg(FYI, "%s: master tlink is NULL\n", __func__);
+ goto free_full_path;
+ }
+
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (!tcon) {
+ cifs_dbg(FYI, "%s: master tcon is NULL\n", __func__);
+ goto free_full_path;
+ }
+
+ root_path = kstrdup(tcon->treeName, GFP_KERNEL);
+ if (!root_path) {
+ mnt = ERR_PTR(-ENOMEM);
goto free_full_path;
}
- ses = tlink_tcon(tlink)->ses;
+ cifs_dbg(FYI, "%s: root path: %s\n", __func__, root_path);
+ ses = tcon->ses;
xid = get_xid();
- rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
- &num_referrals, &referrals,
- cifs_remap(cifs_sb));
- free_xid(xid);
- cifs_put_tlink(tlink);
-
- mnt = ERR_PTR(-ENOENT);
- for (i = 0; i < num_referrals; i++) {
- int len;
- dump_referral(referrals + i);
- /* connect to a node */
- len = strlen(referrals[i].node_name);
- if (len < 2) {
- cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
- __func__, referrals[i].node_name);
- mnt = ERR_PTR(-EINVAL);
- break;
- }
- mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
- full_path, referrals + i);
- cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
- __func__, referrals[i].node_name, mnt);
- if (!IS_ERR(mnt))
- goto success;
+ /*
+ * If the DFS root has expired, unconditionally fetch it again to
+ * refresh the DFS referral cache.
+ */
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ root_path + 1, NULL, NULL);
+ if (!rc) {
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), full_path + 1,
+ &referral, NULL);
}
- /* no valid submounts were found; return error from get_dfs_path() by
- * preference */
- if (rc != 0)
+ free_xid(xid);
+
+ if (rc) {
mnt = ERR_PTR(rc);
+ goto free_root_path;
+ }
-success:
- free_dfs_info_array(referrals, num_referrals);
+ dump_referral(&referral);
+
+ len = strlen(referral.node_name);
+ if (len < 2) {
+ cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+ __func__, referral.node_name);
+ mnt = ERR_PTR(-EINVAL);
+ goto free_dfs_ref;
+ }
+ /*
+ * cifs_mount() will retry every available node server in case
+ * of failures.
+ */
+ mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__,
+ referral.node_name, mnt);
+
+free_dfs_ref:
+ free_dfs_info_param(&referral);
+free_root_path:
+ kfree(root_path);
free_full_path:
kfree(full_path);
cdda_exit:
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9731d0d891e7..42f0d67f1054 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -51,6 +51,7 @@
*/
#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
+#define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -71,6 +72,15 @@ struct cifs_sb_info {
char *mountdata; /* options received at mount time or via DFS refs */
struct delayed_work prune_tlinks;
struct rcu_head rcu;
+
+ /* only used when CIFS_MOUNT_USE_PREFIX_PATH is set */
char *prepath;
+
+ /*
+ * Path initially provided by the mount call. We might connect
+ * to something different via DFS but we want to keep it to do
+ * failover properly.
+ */
+ char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 57ff0756e30c..d8bce2f862de 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,8 +43,19 @@ struct smb_snapshot_array {
/* snapshots[]; */
} __packed;
+struct smb_query_info {
+ __u32 info_type;
+ __u32 file_info_class;
+ __u32 additional_information;
+ __u32 flags;
+ __u32 input_buffer_length;
+ __u32 output_buffer_length;
+ /* char buffer[]; */
+} __packed;
+
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array)
+#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
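[Illustration, not part of the patch: a plausible user-space invocation of the new CIFS_QUERY_INFO passthrough ioctl. The info_type/file_info_class values and the response size below are made-up placeholders; the variable-length buffer follows the fixed header, as the commented-out "char buffer[];" member suggests.]

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

struct smb_query_info {			/* user-space mirror of the struct above */
	unsigned int info_type;
	unsigned int file_info_class;
	unsigned int additional_information;
	unsigned int flags;
	unsigned int input_buffer_length;
	unsigned int output_buffer_length;
	/* char buffer[]; */
};

#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)

int main(int argc, char **argv)
{
	struct smb_query_info *qi;
	size_t out_len = 512;			/* placeholder response size */
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	qi = calloc(1, sizeof(*qi) + out_len);
	if (!qi) {
		close(fd);
		return 1;
	}
	qi->info_type = 1;			/* placeholder SMB2 info type */
	qi->file_info_class = 18;		/* placeholder info class */
	qi->output_buffer_length = out_len;
	if (ioctl(fd, CIFS_QUERY_INFO, qi) < 0)
		perror("CIFS_QUERY_INFO");
	free(qi);
	close(fd);
	return 0;
}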
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index b611fc2e8984..7f01c6e60791 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -147,8 +147,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
sprintf(dp, ";sec=krb5");
else if (server->sec_mskerberos)
sprintf(dp, ";sec=mskrb5");
- else
- goto out;
+ else {
+ cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n");
+ sprintf(dp, ";sec=krb5");
+ }
dp = description + strlen(description);
sprintf(dp, ";uid=0x%x",
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 85b31cfa2f3c..d2a05e46d6f5 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -224,7 +224,7 @@ int cifs_verify_signature(struct smb_rqst *rqst,
if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) {
struct smb_com_lock_req *pSMB =
(struct smb_com_lock_req *)cifs_pdu;
- if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
+ if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
return 0;
}
@@ -304,12 +304,17 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
char *lnm_session_key)
{
- int i;
+ int i, len;
int rc;
char password_with_pad[CIFS_ENCPWD_SIZE] = {0};
- if (password)
- strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
+ if (password) {
+ for (len = 0; len < CIFS_ENCPWD_SIZE; len++)
+ if (!password[len])
+ break;
+
+ memcpy(password_with_pad, password, len);
+ }
if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
memcpy(lnm_session_key, password_with_pad,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 7065426b3280..62d48d486d8f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -52,6 +52,9 @@
#include "cifs_spnego.h"
#include "fscache.h"
#include "smb2pdu.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
int cifsFYI = 0;
bool traceSMB;
@@ -81,6 +84,14 @@ module_param(cifs_max_pending, uint, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
"CIFS/SMB1 dialect (N/A for SMB3) "
"Default: 32767 Range: 2 to 32767.");
+#ifdef CONFIG_CIFS_STATS2
+unsigned int slow_rsp_threshold = 1;
+module_param(slow_rsp_threshold, uint, 0644);
+MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
+ "before logging that a response is delayed. "
+ "Default: 1 (if set to 0 disables msg).");
+#endif /* STATS2 */
+
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
@@ -492,6 +503,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",unix");
else
seq_puts(s, ",nounix");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ seq_puts(s, ",nodfs");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -707,7 +720,14 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
struct cifs_mnt_data mnt_data;
struct dentry *root;
- cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+ /*
+ * Log the attempted mount operation in the kernel / CIFS log
+ * if CIFS_DEBUG && cifsFYI is set.
+ */
+ if (cifsFYI)
+ cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+ else
+ cifs_info("Attempting to mount %s\n", dev_name);
volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3);
if (IS_ERR(volume_info))
@@ -975,17 +995,21 @@ const struct inode_operations cifs_symlink_inode_ops = {
.listxattr = cifs_listxattr,
};
-static int cifs_clone_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, u64 len)
+static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
struct cifsFileInfo *smb_file_src = src_file->private_data;
- struct cifsFileInfo *smb_file_target = dst_file->private_data;
- struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink);
+ struct cifsFileInfo *smb_file_target;
+ struct cifs_tcon *target_tcon;
unsigned int xid;
int rc;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
cifs_dbg(FYI, "clone range\n");
xid = get_xid();
@@ -996,6 +1020,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
goto out;
}
+ smb_file_target = dst_file->private_data;
+ target_tcon = tlink_tcon(smb_file_target->tlink);
+
/*
* Note: the cifs case is easier than btrfs since the server is responsible for
* checks for proper open modes and file type and if it wants
@@ -1025,7 +1052,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
unlock_two_nondirectories(src_inode, target_inode);
out:
free_xid(xid);
- return rc;
+ return rc < 0 ? rc : len;
}
ssize_t cifs_file_copychunk_range(unsigned int xid,
@@ -1134,7 +1161,7 @@ const struct file_operations cifs_file_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1153,15 +1180,14 @@ const struct file_operations cifs_file_strict_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_direct_ops = {
- /* BB reevaluate whether they can be done with directio, no cache */
- .read_iter = cifs_user_readv,
- .write_iter = cifs_user_writev,
+ .read_iter = cifs_direct_readv,
+ .write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -1172,7 +1198,7 @@ const struct file_operations cifs_file_direct_ops = {
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1191,7 +1217,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1209,15 +1235,14 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_direct_nobrl_ops = {
- /* BB reevaluate whether they can be done with directio, no cache */
- .read_iter = cifs_user_readv,
- .write_iter = cifs_user_writev,
+ .read_iter = cifs_direct_readv,
+ .write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -1227,7 +1252,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1239,7 +1264,7 @@ const struct file_operations cifs_dir_ops = {
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.llseek = generic_file_llseek,
.fsync = cifs_dir_fsync,
};
@@ -1418,6 +1443,11 @@ init_cifs(void)
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
+ if (slow_rsp_threshold < 1)
+ cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
+ else if (slow_rsp_threshold > 32767)
+ cifs_dbg(VFS,
+ "slow response threshold set higher than recommended (0 to 32767)\n");
#endif /* CONFIG_CIFS_STATS2 */
atomic_set(&midCount, 0);
@@ -1467,10 +1497,15 @@ init_cifs(void)
if (rc)
goto out_destroy_mids;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ rc = dfs_cache_init();
+ if (rc)
+ goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_DFS_UPCALL */
#ifdef CONFIG_CIFS_UPCALL
rc = init_cifs_spnego();
if (rc)
- goto out_destroy_request_bufs;
+ goto out_destroy_dfs_cache;
#endif /* CONFIG_CIFS_UPCALL */
#ifdef CONFIG_CIFS_ACL
@@ -1498,6 +1533,10 @@ out_register_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
+out_destroy_dfs_cache:
+#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_destroy();
out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
@@ -1529,6 +1568,9 @@ exit_cifs(void)
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_destroy();
+#endif
cifs_destroy_request_bufs();
cifs_destroy_mids();
cifs_destroy_inodecache();
@@ -1538,11 +1580,11 @@ exit_cifs(void)
cifs_proc_clean();
}
-MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
+MODULE_AUTHOR("Steve French");
MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */
MODULE_DESCRIPTION
- ("VFS to access servers complying with the SNIA CIFS Specification "
- "e.g. Samba and Windows");
+ ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
+ "also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
MODULE_SOFTDEP("pre: arc4");
MODULE_SOFTDEP("pre: des");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f047e87871a1..7652551a1fc4 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,8 +101,10 @@ extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
@@ -148,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.13"
+#define CIFS_VERSION "2.17"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9dcaed031843..94dbdbe5be34 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -33,6 +33,7 @@
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
+#define SMB_PATH_MAX 260
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -465,6 +466,11 @@ struct smb_version_operations {
enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
enum securityEnum);
int (*next_header)(char *);
+ /* ioctl passthrough for query_info */
+ int (*ioctl_query_info)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ __le16 *path, int is_dir,
+ unsigned long p);
};
struct smb_version_values {
@@ -654,6 +660,7 @@ struct TCP_Server_Info {
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
+ __u32 reconnect_instance; /* incremented on each reconnect */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
@@ -694,6 +701,13 @@ struct TCP_Server_Info {
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
unsigned long echo_interval;
+
+ /*
+ * Number of targets available for reconnect. The more targets
+ * the more tasks have to wait to let the demultiplex thread
+ * reconnect.
+ */
+ int nr_targets;
};
static inline unsigned int
@@ -798,6 +812,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
* a single wsize request with a single call.
*/
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
/*
* Windows only supports a max of 60kb reads and 65535 byte writes. Default to
@@ -924,6 +939,8 @@ struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
struct list_head rlist; /* reconnect list */
+ atomic_t num_local_opens; /* num of all opens including disconnected */
+ atomic_t num_remote_opens; /* num of all network opens on server */
struct list_head openFileList;
spinlock_t open_file_lock; /* protects list above */
struct cifs_ses *ses; /* pointer to session associated with */
@@ -1004,6 +1021,11 @@ struct cifs_tcon {
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fid crfid; /* Cached root fid */
/* BB add field for back pointer to sb struct(s)? */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ char *dfs_path;
+ int remap:2;
+ struct list_head ulist; /* cache update list */
+#endif
};
/*
@@ -1072,7 +1094,8 @@ struct cifsLockInfo {
__u64 offset;
__u64 length;
__u32 pid;
- __u32 type;
+ __u16 type;
+ __u16 flags;
};
/*
@@ -1114,6 +1137,9 @@ struct cifs_fid {
__u8 create_guid[16];
struct cifs_pending_open *pending_open;
unsigned int epoch;
+#ifdef CONFIG_CIFS_DEBUG2
+ __u64 mid;
+#endif /* CIFS_DEBUG2 */
bool purge_cache;
};
@@ -1172,6 +1198,11 @@ struct cifs_aio_ctx {
unsigned int len;
unsigned int total_len;
bool should_dirty;
+ /*
+ * Indicates if this aio_ctx is for direct_io,
+ * If yes, iter is a copy of the user passed iov_iter
+ */
+ bool direct_io;
};
struct cifs_readdata;
@@ -1407,6 +1438,7 @@ struct mid_q_entry {
int mid_state; /* wish this were enum but can not pass to wait_event */
unsigned int mid_flags;
__le16 command; /* smb command code */
+ unsigned int optype; /* operation type */
bool large_buf:1; /* if valid response, is pointer to large buf */
bool multiRsp:1; /* multiple trans2 responses for one request */
bool multiEnd:1; /* both received */
@@ -1489,6 +1521,7 @@ struct dfs_info3_param {
int ref_flag;
char *path_name;
char *node_name;
+ int ttl;
};
/*
@@ -1526,7 +1559,6 @@ static inline void free_dfs_info_param(struct dfs_info3_param *param)
if (param) {
kfree(param->path_name);
kfree(param->node_name);
- kfree(param);
}
}
@@ -1543,6 +1575,25 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
kfree(param);
}
+static inline bool is_interrupt_error(int error)
+{
+ switch (error) {
+ case -EINTR:
+ case -ERESTARTSYS:
+ case -ERESTARTNOHAND:
+ case -ERESTARTNOINTR:
+ return true;
+ }
+ return false;
+}
+
+static inline bool is_retryable_error(int error)
+{
+ if (is_interrupt_error(error) || error == -EAGAIN)
+ return true;
+ return false;
+}
+
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
#define MID_REQUEST_SUBMITTED 2
@@ -1715,6 +1766,7 @@ GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
#ifdef CONFIG_CIFS_STATS2
GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
GLOBAL_EXTERN atomic_t totSmBufAllocCount;
+extern unsigned int slow_rsp_threshold; /* number of secs before logging */
#endif
GLOBAL_EXTERN atomic_t smBufAllocCount;
GLOBAL_EXTERN atomic_t midCount;
@@ -1770,6 +1822,7 @@ extern struct smb_version_values smb3any_values;
extern struct smb_version_operations smb30_operations;
extern struct smb_version_values smb30_values;
#define SMB302_VERSION_STRING "3.02"
+#define ALT_SMB302_VERSION_STRING "3.0.2"
/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
extern struct smb_version_values smb302_values;
#define SMB311_VERSION_STRING "3.1.1"
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 1ce733f3582f..79d842e7240c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1539,6 +1539,9 @@ struct reparse_symlink_data {
char PathBuffer[0];
} __attribute__((packed));
+/* Flag above */
+#define SYMLINK_FLAG_RELATIVE 0x00000001
+
/* For IO_REPARSE_TAG_NFS */
#define NFS_SPECFILE_LNK 0x00000000014B4E4C
#define NFS_SPECFILE_CHR 0x0000000000524843
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 20adda4de83b..336c116995d7 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -22,6 +22,9 @@
#define _CIFSPROTO_H
#include <linux/nls.h>
#include "trace.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
struct statfs;
struct smb_vol;
@@ -213,13 +216,13 @@ extern int cifs_match_super(struct super_block *, void *);
extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
extern struct smb_vol *cifs_get_volume_info(char *mount_data,
const char *devname, bool is_smb3);
-extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
+extern int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol);
extern void cifs_umount(struct cifs_sb_info *);
extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon);
extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
- __u64 length, __u8 type,
+ __u64 length, __u8 type, __u16 flags,
struct cifsLockInfo **conf_lock,
int rw_check);
extern void cifs_add_pending_open(struct cifs_fid *fid,
@@ -294,11 +297,6 @@ extern int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
unsigned int *num_of_nodes,
const struct nls_table *nls_codepage, int remap);
-extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
- const char *old_path,
- const struct nls_table *nls_codepage,
- unsigned int *num_referrals,
- struct dfs_info3_param **referrals, int remap);
extern int parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
unsigned int *num_of_nodes,
struct dfs_info3_param **target_nodes,
@@ -524,6 +522,11 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
+extern void
+cifs_cleanup_volume_info_contents(struct smb_vol *volume_info);
+
+extern struct TCP_Server_Info *
+cifs_find_tcp_session(struct smb_vol *vol);
void cifs_readdata_release(struct kref *refcount);
int cifs_async_readv(struct cifs_readdata *rdata);
@@ -562,4 +565,17 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
unsigned int *len, unsigned int *offset);
+void extract_unc_hostname(const char *unc, const char **h, size_t *len);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
+ const char *old_path,
+ const struct nls_table *nls_codepage,
+ struct dfs_info3_param *referral, int remap)
+{
+ return dfs_cache_find(xid, ses, nls_codepage, remap, old_path,
+ referral, NULL);
+}
+#endif
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5657b79dbc99..bb54ccf8481c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -44,6 +44,9 @@
#include "cifs_debug.h"
#include "fscache.h"
#include "smbdirect.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
#ifdef CONFIG_CIFS_POSIX
static struct {
@@ -118,6 +121,86 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
*/
}
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
+ struct cifs_tcon *tcon)
+{
+ int rc;
+ struct dfs_cache_tgt_list tl;
+ struct dfs_cache_tgt_iterator *it = NULL;
+ char *tree;
+ const char *tcp_host;
+ size_t tcp_host_len;
+ const char *dfs_host;
+ size_t dfs_host_len;
+
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree)
+ return -ENOMEM;
+
+ if (tcon->ipc) {
+ snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
+ rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+ goto out;
+ }
+
+ if (!tcon->dfs_path) {
+ rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+ goto out;
+ }
+
+ rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
+ if (rc)
+ goto out;
+
+ extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
+ &tcp_host_len);
+
+ for (it = dfs_cache_get_tgt_iterator(&tl); it;
+ it = dfs_cache_get_next_tgt(&tl, it)) {
+ const char *tgt = dfs_cache_get_tgt_name(it);
+
+ extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+
+ if (dfs_host_len != tcp_host_len
+ || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
+ cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s",
+ __func__,
+ (int)dfs_host_len, dfs_host,
+ (int)tcp_host_len, tcp_host);
+ continue;
+ }
+
+ snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+
+ rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+ if (!rc)
+ break;
+ if (rc == -EREMOTE)
+ break;
+ }
+
+ if (!rc) {
+ if (it)
+ rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1,
+ it);
+ else
+ rc = -ENOENT;
+ }
+ dfs_cache_free_tgts(&tl);
+out:
+ kfree(tree);
+ return rc;
+}
+#else
+static inline int __cifs_reconnect_tcon(const struct nls_table *nlsc,
+ struct cifs_tcon *tcon)
+{
+ return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+}
+#endif
+
/* reconnect the socket, tcon, and smb session if needed */
static int
cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
@@ -126,6 +209,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
struct cifs_ses *ses;
struct TCP_Server_Info *server;
struct nls_table *nls_codepage;
+ int retries;
/*
* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
@@ -152,9 +236,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
}
+ retries = server->nr_targets;
+
/*
- * Give demultiplex thread up to 10 seconds to reconnect, should be
- * greater than cifs socket timeout which is 7 seconds
+ * Give the demultiplex thread up to 10 seconds for each target available for
+ * reconnect -- should be greater than cifs socket timeout which is 7
+ * seconds.
*/
while (server->tcpStatus == CifsNeedReconnect) {
rc = wait_event_interruptible_timeout(server->response_q,
@@ -170,6 +257,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
if (server->tcpStatus != CifsNeedReconnect)
break;
+ if (--retries)
+ continue;
+
/*
* on "soft" mounts we wait once. Hard mounts keep
* retrying until process is killed or server comes
@@ -179,6 +269,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
return -EHOSTDOWN;
}
+ retries = server->nr_targets;
}
if (!ses->need_reconnect && !tcon->need_reconnect)
@@ -214,7 +305,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
cifs_mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+ rc = __cifs_reconnect_tcon(nls_codepage, tcon);
mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
@@ -1458,18 +1549,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server)
}
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+ bool malformed)
{
int length;
- struct cifs_readdata *rdata = mid->callback_data;
length = cifs_discard_remaining_data(server);
- dequeue_mid(mid, rdata->result);
+ dequeue_mid(mid, malformed);
mid->resp_buf = server->smallbuf;
server->smallbuf = NULL;
return length;
}
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ return __cifs_readv_discard(server, mid, rdata->result);
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1511,12 +1610,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return -1;
}
+ /* set up first two iov for signature check and to get credits */
+ rdata->iov[0].iov_base = buf;
+ rdata->iov[0].iov_len = 4;
+ rdata->iov[1].iov_base = buf + 4;
+ rdata->iov[1].iov_len = server->total_read - 4;
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+ cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+ rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
cifs_dbg(FYI, "%s: server returned error %d\n",
__func__, rdata->result);
- return cifs_readv_discard(server, mid);
+ /* normal error on read response */
+ return __cifs_readv_discard(server, mid, false);
}
/* Is there enough to get to the rest of the READ_RSP header? */
@@ -1560,14 +1670,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->total_read += length;
}
- /* set up first iov for signature check */
- rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->total_read - 4;
- cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n",
- rdata->iov[0].iov_base, server->total_read);
-
/* how much data is in the response? */
#ifdef CONFIG_CIFS_SMB_DIRECT
use_rdma_mr = rdata->mr;
@@ -1607,6 +1709,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
+ .rq_offset = rdata->page_offset,
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
@@ -2031,7 +2134,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
for (j = 0; j < nr_pages; j++) {
unlock_page(wdata2->pages[j]);
- if (rc != 0 && rc != -EAGAIN) {
+ if (rc != 0 && !is_retryable_error(rc)) {
SetPageError(wdata2->pages[j]);
end_page_writeback(wdata2->pages[j]);
put_page(wdata2->pages[j]);
@@ -2040,7 +2143,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
if (rc) {
kref_put(&wdata2->refcount, cifs_writedata_release);
- if (rc == -EAGAIN)
+ if (is_retryable_error(rc))
continue;
break;
}
@@ -2049,7 +2152,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
i += nr_pages;
} while (i < wdata->nr_pages);
- mapping_set_error(inode->i_mapping, rc);
+ if (rc != 0 && !is_retryable_error(rc))
+ mapping_set_error(inode->i_mapping, rc);
kref_put(&wdata->refcount, cifs_writedata_release);
}
@@ -2210,6 +2314,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
rqst.rq_pages = wdata->pages;
+ rqst.rq_offset = wdata->page_offset;
rqst.rq_npages = wdata->nr_pages;
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
@@ -5027,6 +5132,13 @@ oldQFSInfoRetry:
le16_to_cpu(response_data->BytesPerSector) *
le32_to_cpu(response_data->
SectorsPerAllocationUnit);
+ /*
+ * much prefer larger but if server doesn't report
+ * a valid size then 4K is a reasonable minimum
+ */
+ if (FSData->f_bsize < 512)
+ FSData->f_bsize = 4096;
+
FSData->f_blocks =
le32_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
@@ -5107,6 +5219,13 @@ QFSInfoRetry:
le32_to_cpu(response_data->BytesPerSector) *
le32_to_cpu(response_data->
SectorsPerAllocationUnit);
+ /*
+ * much prefer larger but if server doesn't report
+ * a valid size then 4K is a reasonable minimum
+ */
+ if (FSData->f_bsize < 512)
+ FSData->f_bsize = 4096;
+
FSData->f_blocks =
le64_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
@@ -5470,6 +5589,13 @@ QFSPosixRetry:
data_offset);
FSData->f_bsize =
le32_to_cpu(response_data->BlockSize);
+ /*
+ * much prefer larger but if server doesn't report
+ * a valid size then 4K is a reasonable minimum
+ */
+ if (FSData->f_bsize < 512)
+ FSData->f_bsize = 4096;
+
FSData->f_blocks =
le64_to_cpu(response_data->TotalBlocks);
FSData->f_bfree =
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 52d71b64c0c6..8463c940e0e5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -56,6 +56,11 @@
#include "fscache.h"
#include "smb2proto.h"
#include "smbdirect.h"
+#include "dns_resolve.h"
+#include "cifsfs.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
extern mempool_t *cifs_req_poolp;
extern bool disable_legacy_dialects;
@@ -250,6 +255,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_ignore, "dev" },
{ Opt_ignore, "mand" },
{ Opt_ignore, "nomand" },
+ { Opt_ignore, "relatime" },
{ Opt_ignore, "_netdev" },
{ Opt_err, NULL }
@@ -303,6 +309,7 @@ static const match_table_t cifs_smb_version_tokens = {
{ Smb_21, SMB21_VERSION_STRING },
{ Smb_30, SMB30_VERSION_STRING },
{ Smb_302, SMB302_VERSION_STRING },
+ { Smb_302, ALT_SMB302_VERSION_STRING },
{ Smb_311, SMB311_VERSION_STRING },
{ Smb_311, ALT_SMB311_VERSION_STRING },
{ Smb_3any, SMB3ANY_VERSION_STRING },
@@ -316,6 +323,132 @@ static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
static void cifs_prune_tlinks(struct work_struct *work);
static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
const char *devname, bool is_smb3);
+static char *extract_hostname(const char *unc);
+
+/*
+ * Resolve the hostname and set the ip addr in the tcp ses. Useful for
+ * hostnames whose ip addresses may change at some point.
+ *
+ * This should be called with server->srv_mutex held.
+ */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+ int rc;
+ int len;
+ char *unc, *ipaddr = NULL;
+
+ if (!server->hostname)
+ return -EINVAL;
+
+ len = strlen(server->hostname) + 3;
+
+ unc = kmalloc(len, GFP_KERNEL);
+ if (!unc) {
+ cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
+ return -ENOMEM;
+ }
+ snprintf(unc, len, "\\\\%s", server->hostname);
+
+ rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
+ kfree(unc);
+
+ if (rc < 0) {
+ cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n",
+ __func__, server->hostname, rc);
+ return rc;
+ }
+
+ rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
+ strlen(ipaddr));
+ kfree(ipaddr);
+
+ return !rc ? -1 : 0;
+}
+#else
+static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+struct super_cb_data {
+ struct TCP_Server_Info *server;
+ struct cifs_sb_info *cifs_sb;
+};
+
+/* These functions must be called with server->srv_mutex held */
+
+static void super_cb(struct super_block *sb, void *arg)
+{
+ struct super_cb_data *d = arg;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+
+ if (d->cifs_sb)
+ return;
+
+ cifs_sb = CIFS_SB(sb);
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (tcon->ses->server == d->server)
+ d->cifs_sb = cifs_sb;
+}
+
+static inline struct cifs_sb_info *
+find_super_by_tcp(struct TCP_Server_Info *server)
+{
+ struct super_cb_data d = {
+ .server = server,
+ .cifs_sb = NULL,
+ };
+
+ iterate_supers_type(&cifs_fs_type, super_cb, &d);
+ return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT);
+}
+
+static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
+ struct cifs_sb_info *cifs_sb,
+ struct dfs_cache_tgt_list *tgt_list,
+ struct dfs_cache_tgt_iterator **tgt_it)
+{
+ const char *name;
+
+ if (!cifs_sb || !cifs_sb->origin_fullpath || !tgt_list ||
+ !server->nr_targets)
+ return;
+
+ if (!*tgt_it) {
+ *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+ } else {
+ *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
+ if (!*tgt_it)
+ *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+ }
+
+ cifs_dbg(FYI, "%s: UNC: %s\n", __func__, cifs_sb->origin_fullpath);
+
+ name = dfs_cache_get_tgt_name(*tgt_it);
+
+ kfree(server->hostname);
+
+ server->hostname = extract_hostname(name);
+ if (IS_ERR(server->hostname)) {
+ cifs_dbg(FYI,
+ "%s: failed to extract hostname from target: %ld\n",
+ __func__, PTR_ERR(server->hostname));
+ }
+}
+
+static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb,
+ struct dfs_cache_tgt_list *tl,
+ struct dfs_cache_tgt_iterator **it)
+{
+ if (!cifs_sb->origin_fullpath)
+ return -EOPNOTSUPP;
+ return dfs_cache_noreq_find(cifs_sb->origin_fullpath + 1, NULL, tl);
+}
+#endif
/*
* cifs tcp session reconnection
@@ -334,8 +467,33 @@ cifs_reconnect(struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct mid_q_entry *mid_entry;
struct list_head retry_list;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct cifs_sb_info *cifs_sb = NULL;
+ struct dfs_cache_tgt_list tgt_list = {0};
+ struct dfs_cache_tgt_iterator *tgt_it = NULL;
+#endif
spin_lock(&GlobalMid_Lock);
+ server->nr_targets = 1;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ cifs_sb = find_super_by_tcp(server);
+ if (IS_ERR(cifs_sb)) {
+ rc = PTR_ERR(cifs_sb);
+ cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
+ __func__, rc);
+ cifs_sb = NULL;
+ } else {
+ rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
+ if (rc && (rc != -EOPNOTSUPP)) {
+ cifs_dbg(VFS, "%s: no target servers for DFS failover\n",
+ __func__);
+ } else {
+ server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list);
+ }
+ }
+ cifs_dbg(FYI, "%s: will retry %d target(s)\n", __func__,
+ server->nr_targets);
+#endif
if (server->tcpStatus == CifsExiting) {
/* the demux thread will exit normally
next time through the loop */
@@ -347,7 +505,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->maxBuf = 0;
server->max_read = 0;
- cifs_dbg(FYI, "Reconnecting tcp session\n");
+ cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
trace_smb3_reconnect(server->CurrentMid, server->hostname);
/* before reconnecting the tcp session, mark the smb session (uid)
@@ -409,14 +567,27 @@ cifs_reconnect(struct TCP_Server_Info *server)
do {
try_to_freeze();
- /* we should try only the port we connected to before */
mutex_lock(&server->srv_mutex);
+ /*
+ * Set up next DFS target server (if any) for reconnect. If DFS
+ * feature is disabled, then we will retry last server we
+ * connected to before.
+ */
if (cifs_rdma_enabled(server))
rc = smbd_reconnect(server);
else
rc = generic_ip_connect(server);
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ reconn_inval_dfs_target(server, cifs_sb, &tgt_list,
+ &tgt_it);
+#endif
+ rc = reconn_set_ipaddr(server);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+ __func__, rc);
+ }
mutex_unlock(&server->srv_mutex);
msleep(3000);
} else {
@@ -429,6 +600,22 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
} while (server->tcpStatus == CifsNeedReconnect);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (tgt_it) {
+ rc = dfs_cache_noreq_update_tgthint(cifs_sb->origin_fullpath + 1,
+ tgt_it);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n",
+ __func__, rc);
+ }
+ rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n",
+ __func__, rc);
+ }
+ dfs_cache_free_tgts(&tgt_list);
+ }
+#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
@@ -533,6 +720,21 @@ server_unresponsive(struct TCP_Server_Info *server)
return false;
}
+static inline bool
+zero_credits(struct TCP_Server_Info *server)
+{
+ int val;
+
+ spin_lock(&server->req_lock);
+ val = server->credits + server->echo_credits + server->oplock_credits;
+ if (server->in_flight == 0 && val == 0) {
+ spin_unlock(&server->req_lock);
+ return true;
+ }
+ spin_unlock(&server->req_lock);
+ return false;
+}
+
static int
cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
{
@@ -545,6 +747,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
+ /* reconnect if no credits and no requests in flight */
+ if (zero_credits(server)) {
+ cifs_reconnect(server);
+ return -ECONNABORTED;
+ }
+
if (server_unresponsive(server))
return -ECONNABORTED;
if (cifs_rdma_enabled(server) && server->smbd_conn)
@@ -588,7 +796,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
{
struct msghdr smb_msg;
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
- iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read);
+ iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -600,7 +808,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
struct msghdr smb_msg;
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
- iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read);
+ iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -1042,7 +1250,12 @@ extract_hostname(const char *unc)
/* skip double chars at beginning of string */
/* BB: check validity of these bytes? */
- src = unc + 2;
+ if (strlen(unc) < 3)
+ return ERR_PTR(-EINVAL);
+ for (src = unc; *src && *src == '\\'; src++)
+ ;
+ if (!*src)
+ return ERR_PTR(-EINVAL);
/* delimiter between hostname and sharename is always '\\' now */
delim = strchr(src, '\\');
@@ -1826,7 +2039,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->password = NULL;
break;
}
- /* Yes it is. Drop down to Opt_pass below.*/
+ /* Fallthrough - to Opt_pass below.*/
case Opt_pass:
/* Obtain the value string */
value = strchr(data, '=');
@@ -2288,7 +2501,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
return 1;
}
-static struct TCP_Server_Info *
+struct TCP_Server_Info *
cifs_find_tcp_session(struct smb_vol *vol)
{
struct TCP_Server_Info *server;
@@ -2396,6 +2609,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
+ tcp_ses->reconnect_instance = 0;
tcp_ses->lstrp = jiffies;
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
@@ -2459,6 +2673,8 @@ smbd_connected:
}
tcp_ses->tcpStatus = CifsNeedNegotiate;
+ tcp_ses->nr_targets = 1;
+
/* thread spawned, put it on the list */
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
@@ -3085,10 +3301,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
if (rc)
goto out_fail;
- if (volume_info->nodfs) {
- tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
- cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
- }
tcon->use_persistent = false;
/* check if SMB2 or later, CIFS does not support persistent handles */
if (volume_info->persistent) {
@@ -3258,25 +3470,6 @@ out:
return rc;
}
-int
-get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
- const struct nls_table *nls_codepage, unsigned int *num_referrals,
- struct dfs_info3_param **referrals, int remap)
-{
- int rc = 0;
-
- if (!ses->server->ops->get_dfs_refer)
- return -ENOSYS;
-
- *num_referrals = 0;
- *referrals = NULL;
-
- rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
- referrals, num_referrals,
- nls_codepage, remap);
- return rc;
-}
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key cifs_key[2];
static struct lock_class_key cifs_slock_key[2];
@@ -3663,6 +3856,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->actimeo = pvolume_info->actimeo;
cifs_sb->local_nls = pvolume_info->local_nls;
+ if (pvolume_info->nodfs)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS;
if (pvolume_info->noperm)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
if (pvolume_info->setuids)
@@ -3746,8 +3941,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
return 0;
}
-static void
-cleanup_volume_info_contents(struct smb_vol *volume_info)
+void
+cifs_cleanup_volume_info_contents(struct smb_vol *volume_info)
{
kfree(volume_info->username);
kzfree(volume_info->password);
@@ -3762,10 +3957,136 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
{
if (!volume_info)
return;
- cleanup_volume_info_contents(volume_info);
+ cifs_cleanup_volume_info_contents(volume_info);
kfree(volume_info);
}
+/* Release all successfully acquired connections */
+static inline void mount_put_conns(struct cifs_sb_info *cifs_sb,
+ unsigned int xid,
+ struct TCP_Server_Info *server,
+ struct cifs_ses *ses, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+
+ if (tcon)
+ cifs_put_tcon(tcon);
+ else if (ses)
+ cifs_put_smb_ses(ses);
+ else if (server)
+ cifs_put_tcp_session(server, 0);
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+ free_xid(xid);
+}
+
+/* Get connections for tcp, ses and tcon */
+static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+ unsigned int *xid,
+ struct TCP_Server_Info **nserver,
+ struct cifs_ses **nses, struct cifs_tcon **ntcon)
+{
+ int rc = 0;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ *nserver = NULL;
+ *nses = NULL;
+ *ntcon = NULL;
+
+ *xid = get_xid();
+
+ /* get a reference to a tcp session */
+ server = cifs_get_tcp_session(vol);
+ if (IS_ERR(server)) {
+ rc = PTR_ERR(server);
+ return rc;
+ }
+
+ *nserver = server;
+
+ if ((vol->max_credits < 20) || (vol->max_credits > 60000))
+ server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
+ else
+ server->max_credits = vol->max_credits;
+
+ /* get a reference to a SMB session */
+ ses = cifs_get_smb_ses(server, vol);
+ if (IS_ERR(ses)) {
+ rc = PTR_ERR(ses);
+ return rc;
+ }
+
+ *nses = ses;
+
+ if ((vol->persistent == true) && (!(ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) {
+ cifs_dbg(VFS, "persistent handles not supported by server\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* search for existing tcon to this server share */
+ tcon = cifs_get_tcon(ses, vol);
+ if (IS_ERR(tcon)) {
+ rc = PTR_ERR(tcon);
+ return rc;
+ }
+
+ *ntcon = tcon;
+
+ /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+ if (tcon->posix_extensions)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+
+ /* tell server which Unix caps we support */
+ if (cap_unix(tcon->ses)) {
+ /*
+ * reset of caps checks mount to see if unix extensions disabled
+ * for just this mount.
+ */
+ reset_cifs_unix_caps(*xid, tcon, cifs_sb, vol);
+ if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
+ (le64_to_cpu(tcon->fsUnixInfo.Capability) &
+ CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP))
+ return -EACCES;
+ } else
+ tcon->unix_ext = 0; /* server does not support them */
+
+ /* do not care if the following call succeeds - informational */
+ if (!tcon->pipe && server->ops->qfs_tcon)
+ server->ops->qfs_tcon(*xid, tcon);
+
+ cifs_sb->wsize = server->ops->negotiate_wsize(tcon, vol);
+ cifs_sb->rsize = server->ops->negotiate_rsize(tcon, vol);
+
+ return 0;
+}
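
A minimal sketch of the calling convention the two helpers above establish
(hypothetical caller, error handling trimmed; not part of the patch): on
failure, mount_get_conns() leaves its out pointers holding whatever was
acquired so far, so a single mount_put_conns() call releases exactly that
subset along with the xid.

	static int example_connect(struct smb_vol *vol,
				   struct cifs_sb_info *cifs_sb)
	{
		unsigned int xid;
		struct TCP_Server_Info *server;
		struct cifs_ses *ses;
		struct cifs_tcon *tcon;
		int rc;

		rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
		if (rc) {
			/* puts tcon, or ses, or server - whichever came last */
			mount_put_conns(cifs_sb, xid, server, ses, tcon);
			return rc;
		}
		/* ... use server/ses/tcon ... */
		free_xid(xid);
		return 0;
	}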
+
+static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+ struct cifs_tcon *tcon)
+{
+ struct tcon_link *tlink;
+
+ /* hang the tcon off of the superblock */
+ tlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+ if (tlink == NULL)
+ return -ENOMEM;
+
+ tlink->tl_uid = ses->linux_uid;
+ tlink->tl_tcon = tcon;
+ tlink->tl_time = jiffies;
+ set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+ set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+
+ cifs_sb->master_tlink = tlink;
+ spin_lock(&cifs_sb->tlink_tree_lock);
+ tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+ spin_unlock(&cifs_sb->tlink_tree_lock);
+
+ queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
+ TLINK_IDLE_EXPIRE);
+ return 0;
+}
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3774,10 +4095,11 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
*/
static char *
build_unc_path_to_root(const struct smb_vol *vol,
- const struct cifs_sb_info *cifs_sb)
+ const struct cifs_sb_info *cifs_sb, bool useppath)
{
char *full_path, *pos;
- unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+ unsigned int pplen = useppath && vol->prepath ?
+ strlen(vol->prepath) + 1 : 0;
unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3799,8 +4121,9 @@ build_unc_path_to_root(const struct smb_vol *vol,
return full_path;
}
-/*
- * Perform a dfs referral query for a share and (optionally) prefix
+/**
+ * expand_dfs_referral - Perform a dfs referral query and update the cifs_sb
*
* If a referral is found, cifs_sb->mountdata will be (re-)allocated
* to a string containing updated options for the submount. Otherwise it
@@ -3815,36 +4138,36 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
int check_prefix)
{
int rc;
- unsigned int num_referrals = 0;
- struct dfs_info3_param *referrals = NULL;
+ struct dfs_info3_param referral = {0};
char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
- full_path = build_unc_path_to_root(volume_info, cifs_sb);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ return -EREMOTE;
+
+ full_path = build_unc_path_to_root(volume_info, cifs_sb, true);
if (IS_ERR(full_path))
return PTR_ERR(full_path);
/* For DFS paths, skip the first '\' of the UNC */
ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
- rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
- &num_referrals, &referrals, cifs_remap(cifs_sb));
-
- if (!rc && num_referrals > 0) {
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ ref_path, &referral, NULL);
+ if (!rc) {
char *fake_devname = NULL;
mdata = cifs_compose_mount_options(cifs_sb->mountdata,
- full_path + 1, referrals,
+ full_path + 1, &referral,
&fake_devname);
-
- free_dfs_info_array(referrals, num_referrals);
+ free_dfs_info_param(&referral);
if (IS_ERR(mdata)) {
rc = PTR_ERR(mdata);
mdata = NULL;
} else {
- cleanup_volume_info_contents(volume_info);
+ cifs_cleanup_volume_info_contents(volume_info);
rc = cifs_setup_volume_info(volume_info, mdata,
- fake_devname, false);
+ fake_devname, false);
}
kfree(fake_devname);
kfree(cifs_sb->mountdata);
@@ -3853,6 +4176,143 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
kfree(full_path);
return rc;
}
+
+static inline int get_next_dfs_tgt(const char *path,
+ struct dfs_cache_tgt_list *tgt_list,
+ struct dfs_cache_tgt_iterator **tgt_it)
+{
+ if (!*tgt_it)
+ *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+ else
+ *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
+ return !*tgt_it ? -EHOSTDOWN : 0;
+}
+
+static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
+ struct smb_vol *fake_vol, struct smb_vol *vol)
+{
+ const char *tgt = dfs_cache_get_tgt_name(tgt_it);
+ int len = strlen(tgt) + 2;
+ char *new_unc;
+
+ new_unc = kmalloc(len, GFP_KERNEL);
+ if (!new_unc)
+ return -ENOMEM;
+ snprintf(new_unc, len, "\\%s", tgt);
+
+ kfree(vol->UNC);
+ vol->UNC = new_unc;
+
+ if (fake_vol->prepath) {
+ kfree(vol->prepath);
+ vol->prepath = fake_vol->prepath;
+ fake_vol->prepath = NULL;
+ }
+ memcpy(&vol->dstaddr, &fake_vol->dstaddr, sizeof(vol->dstaddr));
+
+ return 0;
+}
+
+static int setup_dfs_tgt_conn(const char *path,
+ const struct dfs_cache_tgt_iterator *tgt_it,
+ struct cifs_sb_info *cifs_sb,
+ struct smb_vol *vol,
+ unsigned int *xid,
+ struct TCP_Server_Info **server,
+ struct cifs_ses **ses,
+ struct cifs_tcon **tcon)
+{
+ int rc;
+ struct dfs_info3_param ref = {0};
+ char *mdata = NULL, *fake_devname = NULL;
+ struct smb_vol fake_vol = {0};
+
+ cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
+
+ rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref);
+ if (rc)
+ return rc;
+
+ mdata = cifs_compose_mount_options(cifs_sb->mountdata, path, &ref,
+ &fake_devname);
+ free_dfs_info_param(&ref);
+
+ if (IS_ERR(mdata)) {
+ rc = PTR_ERR(mdata);
+ mdata = NULL;
+ } else {
+ cifs_dbg(FYI, "%s: fake_devname: %s\n", __func__, fake_devname);
+ rc = cifs_setup_volume_info(&fake_vol, mdata, fake_devname,
+ false);
+ }
+ kfree(mdata);
+ kfree(fake_devname);
+
+ if (!rc) {
+ /*
+ * We use a 'fake_vol' here because we need to pass it down to the
+ * mount_{get,put} functions to test the connection against new DFS
+ * targets.
+ */
+ mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
+ rc = mount_get_conns(&fake_vol, cifs_sb, xid, server, ses,
+ tcon);
+ if (!rc) {
+ /*
+ * We were able to connect to new target server.
+ * Update current volume info with new target server.
+ */
+ rc = update_vol_info(tgt_it, &fake_vol, vol);
+ }
+ }
+ cifs_cleanup_volume_info_contents(&fake_vol);
+ return rc;
+}
+
+static int mount_do_dfs_failover(const char *path,
+ struct cifs_sb_info *cifs_sb,
+ struct smb_vol *vol,
+ struct cifs_ses *root_ses,
+ unsigned int *xid,
+ struct TCP_Server_Info **server,
+ struct cifs_ses **ses,
+ struct cifs_tcon **tcon)
+{
+ int rc;
+ struct dfs_cache_tgt_list tgt_list;
+ struct dfs_cache_tgt_iterator *tgt_it = NULL;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ return -EOPNOTSUPP;
+
+ rc = dfs_cache_noreq_find(path, NULL, &tgt_list);
+ if (rc)
+ return rc;
+
+ for (;;) {
+ /* Get next DFS target server - if any */
+ rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it);
+ if (rc)
+ break;
+ /* Connect to next DFS target */
+ rc = setup_dfs_tgt_conn(path, tgt_it, cifs_sb, vol, xid, server,
+ ses, tcon);
+ if (!rc || rc == -EACCES || rc == -EOPNOTSUPP)
+ break;
+ }
+ if (!rc) {
+ /*
+ * Update DFS target hint in DFS referral cache with the target
+ * server we successfully reconnected to.
+ */
+ rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb), path,
+ tgt_it);
+ }
+ dfs_cache_free_tgts(&tgt_list);
+ return rc;
+}
#endif
static int
@@ -3951,107 +4411,108 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
return rc;
}
-int
-cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
+/*
+ * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is,
+ * otherwise 0.
+ */
+static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
+ const unsigned int xid,
+ struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon)
{
int rc;
- unsigned int xid;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
- char *full_path;
- struct tcon_link *tlink;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- int referral_walks_count = 0;
-#endif
-
-#ifdef CONFIG_CIFS_DFS_UPCALL
-try_mount_again:
- /* cleanup activities if we're chasing a referral */
- if (referral_walks_count) {
- if (tcon)
- cifs_put_tcon(tcon);
- else if (ses)
- cifs_put_smb_ses(ses);
+ char *full_path;
- cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+ if (!server->ops->is_path_accessible)
+ return -EOPNOTSUPP;
- free_xid(xid);
- }
-#endif
- rc = 0;
- tcon = NULL;
- ses = NULL;
- server = NULL;
- full_path = NULL;
- tlink = NULL;
+ /*
+ * cifs_build_path_to_root works only when we have a valid tcon
+ */
+ full_path = cifs_build_path_to_root(vol, cifs_sb, tcon,
+ tcon->Flags & SMB_SHARE_IS_IN_DFS);
+ if (full_path == NULL)
+ return -ENOMEM;
- xid = get_xid();
+ cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
- /* get a reference to a tcp session */
- server = cifs_get_tcp_session(volume_info);
- if (IS_ERR(server)) {
- rc = PTR_ERR(server);
- goto out;
- }
- if ((volume_info->max_credits < 20) ||
- (volume_info->max_credits > 60000))
- server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
- else
- server->max_credits = volume_info->max_credits;
- /* get a reference to a SMB session */
- ses = cifs_get_smb_ses(server, volume_info);
- if (IS_ERR(ses)) {
- rc = PTR_ERR(ses);
- ses = NULL;
- goto mount_fail_check;
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+ full_path);
+ if (rc != 0 && rc != -EREMOTE) {
+ kfree(full_path);
+ return rc;
}
- if ((volume_info->persistent == true) && ((ses->server->capabilities &
- SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
- cifs_dbg(VFS, "persistent handles not supported by server\n");
- rc = -EOPNOTSUPP;
- goto mount_fail_check;
+ if (rc != -EREMOTE) {
+ rc = cifs_are_all_path_components_accessible(server, xid, tcon,
+ cifs_sb,
+ full_path);
+ if (rc != 0) {
+ cifs_dbg(VFS, "cannot query dirs between root and final path, "
+ "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = 0;
+ }
}
- /* search for existing tcon to this server share */
- tcon = cifs_get_tcon(ses, volume_info);
- if (IS_ERR(tcon)) {
- rc = PTR_ERR(tcon);
- tcon = NULL;
- if (rc == -EACCES)
- goto mount_fail_check;
-
- goto remote_path_check;
- }
+ kfree(full_path);
+ return rc;
+}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
+{
+ int rc = 0;
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct cifs_tcon *root_tcon = NULL;
+ struct cifs_tcon *tcon = NULL;
+ struct TCP_Server_Info *server;
+ char *root_path = NULL, *full_path = NULL;
+ char *old_mountdata;
+ int count;
- /* tell server which Unix caps we support */
- if (cap_unix(tcon->ses)) {
- /* reset of caps checks mount to see if unix extensions
- disabled for just this mount */
- reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info);
- if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
- (le64_to_cpu(tcon->fsUnixInfo.Capability) &
- CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
- rc = -EACCES;
- goto mount_fail_check;
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+ if (!rc && tcon) {
+ /* If not a standalone DFS root, then check if path is remote */
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), vol->UNC + 1, NULL,
+ NULL);
+ if (rc) {
+ rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+ if (!rc)
+ goto out;
+ if (rc != -EREMOTE)
+ goto error;
}
- } else
- tcon->unix_ext = 0; /* server does not support them */
-
- /* do not care if a following call succeed - informational */
- if (!tcon->pipe && server->ops->qfs_tcon)
- server->ops->qfs_tcon(xid, tcon);
+ }
+ /*
+ * If the first DFS target server went offline and we failed to connect
+ * to it, the server and ses pointers are NULL at this point, though we
+ * still have a chance to get a cached DFS referral in
+ * expand_dfs_referral() and retry the next target available in it.
+ *
+ * If a NULL ses ptr is passed to dfs_cache_find(), a lookup will be
+ * performed against the DFS path and *no* requests will be sent to the
+ * server for any new DFS referrals. Hence it's safe to skip checking
+ * whether the server or ses ptr is NULL.
+ */
+ if (rc == -EACCES || rc == -EOPNOTSUPP)
+ goto error;
- cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
- cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
+ root_path = build_unc_path_to_root(vol, cifs_sb, false);
+ if (IS_ERR(root_path)) {
+ rc = PTR_ERR(root_path);
+ root_path = NULL;
+ goto error;
+ }
-remote_path_check:
-#ifdef CONFIG_CIFS_DFS_UPCALL
+ full_path = build_unc_path_to_root(vol, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ goto error;
+ }
/*
* Perform an unconditional check for whether there are DFS
* referrals for this path without prefix, to provide support
@@ -4059,119 +4520,173 @@ remote_path_check:
* with PATH_NOT_COVERED to requests that include the prefix.
* Chase the referral if found, otherwise continue normally.
*/
- if (referral_walks_count == 0) {
- int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb,
- false);
- if (!refrc) {
- referral_walks_count++;
- goto try_mount_again;
- }
+ old_mountdata = cifs_sb->mountdata;
+ (void)expand_dfs_referral(xid, ses, vol, cifs_sb, false);
+
+ if (cifs_sb->mountdata == NULL) {
+ rc = -ENOENT;
+ goto error;
}
-#endif
- /* check if a whole path is not remote */
- if (!rc && tcon) {
- if (!server->ops->is_path_accessible) {
- rc = -ENOSYS;
- goto mount_fail_check;
+ if (cifs_sb->mountdata != old_mountdata) {
+ /* If we were redirected, reconnect to new target server */
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+ }
+ if (rc) {
+ if (rc == -EACCES || rc == -EOPNOTSUPP)
+ goto error;
+ /* Perform DFS failover to any other DFS targets */
+ rc = mount_do_dfs_failover(root_path + 1, cifs_sb, vol, NULL,
+ &xid, &server, &ses, &tcon);
+ if (rc)
+ goto error;
+ }
+
+ kfree(root_path);
+ root_path = build_unc_path_to_root(vol, cifs_sb, false);
+ if (IS_ERR(root_path)) {
+ rc = PTR_ERR(root_path);
+ root_path = NULL;
+ goto error;
+ }
+ /* Cache the resolved root server */
+ (void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ root_path + 1, NULL, NULL);
+ /*
+ * Save root tcon for additional DFS requests to update or create a new
+ * DFS cache entry, or even perform DFS failover.
+ */
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ tcon->dfs_path = root_path;
+ root_path = NULL;
+ tcon->remap = cifs_remap(cifs_sb);
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ root_tcon = tcon;
+
+ for (count = 1; ;) {
+ if (!rc && tcon) {
+ rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+ if (!rc || rc != -EREMOTE)
+ break;
}
/*
- * cifs_build_path_to_root works only when we have a valid tcon
+ * BB: when we implement proper loop detection,
+ * we will remove this check. But now we need it
+ * to prevent an indefinite loop if 'DFS tree' is
+ * misconfigured (i.e. has loops).
*/
- full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon,
- tcon->Flags & SMB_SHARE_IS_IN_DFS);
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto mount_fail_check;
- }
- rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
- full_path);
- if (rc != 0 && rc != -EREMOTE) {
- kfree(full_path);
- goto mount_fail_check;
+ if (count++ > MAX_NESTED_LINKS) {
+ rc = -ELOOP;
+ break;
}
- if (rc != -EREMOTE) {
- rc = cifs_are_all_path_components_accessible(server,
- xid, tcon, cifs_sb,
- full_path);
- if (rc != 0) {
- cifs_dbg(VFS, "cannot query dirs between root and final path, "
- "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- rc = 0;
- }
- }
kfree(full_path);
- }
-
- /* get referral if needed */
- if (rc == -EREMOTE) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (referral_walks_count > MAX_NESTED_LINKS) {
- /*
- * BB: when we implement proper loop detection,
- * we will remove this check. But now we need it
- * to prevent an indefinite loop if 'DFS tree' is
- * misconfigured (i.e. has loops).
- */
- rc = -ELOOP;
- goto mount_fail_check;
+ full_path = build_unc_path_to_root(vol, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ break;
}
- rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true);
+ old_mountdata = cifs_sb->mountdata;
+ rc = expand_dfs_referral(xid, root_tcon->ses, vol, cifs_sb,
+ true);
+ if (rc)
+ break;
- if (!rc) {
- referral_walks_count++;
- goto try_mount_again;
+ if (cifs_sb->mountdata != old_mountdata) {
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses,
+ &tcon);
+ }
+ if (rc) {
+ if (rc == -EACCES || rc == -EOPNOTSUPP)
+ break;
+ /* Perform DFS failover to any other DFS targets */
+ rc = mount_do_dfs_failover(full_path + 1, cifs_sb, vol,
+ root_tcon->ses, &xid,
+ &server, &ses, &tcon);
+ if (rc == -EACCES || rc == -EOPNOTSUPP || !server ||
+ !ses)
+ goto error;
}
- goto mount_fail_check;
-#else /* No DFS support, return error on mount */
- rc = -EOPNOTSUPP;
-#endif
}
+ cifs_put_tcon(root_tcon);
if (rc)
- goto mount_fail_check;
+ goto error;
- /* now, hang the tcon off of the superblock */
- tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
- if (tlink == NULL) {
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!tcon->dfs_path) {
+ /* Save full path in new tcon to do failover when reconnecting tcons */
+ tcon->dfs_path = full_path;
+ full_path = NULL;
+ tcon->remap = cifs_remap(cifs_sb);
+ }
+ cifs_sb->origin_fullpath = kstrndup(tcon->dfs_path,
+ strlen(tcon->dfs_path),
+ GFP_ATOMIC);
+ if (!cifs_sb->origin_fullpath) {
+ spin_unlock(&cifs_tcp_ses_lock);
rc = -ENOMEM;
- goto mount_fail_check;
+ goto error;
}
+ spin_unlock(&cifs_tcp_ses_lock);
- tlink->tl_uid = ses->linux_uid;
- tlink->tl_tcon = tcon;
- tlink->tl_time = jiffies;
- set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
- set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+ rc = dfs_cache_add_vol(vol, cifs_sb->origin_fullpath);
+ if (rc) {
+ kfree(cifs_sb->origin_fullpath);
+ goto error;
+ }
+ /*
+ * After reconnecting to a different server, unique ids won't
+ * match anymore, so we disable serverino. This prevents dentry
+ * revalidation from treating the dentries as stale (ESTALE).
+ */
+ cifs_autodisable_serverino(cifs_sb);
+out:
+ free_xid(xid);
+ return mount_setup_tlink(cifs_sb, ses, tcon);
- cifs_sb->master_tlink = tlink;
- spin_lock(&cifs_sb->tlink_tree_lock);
- tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
- spin_unlock(&cifs_sb->tlink_tree_lock);
+error:
+ kfree(full_path);
+ kfree(root_path);
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ return rc;
+}
+#else
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
+{
+ int rc = 0;
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
- queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
- TLINK_IDLE_EXPIRE);
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+ if (rc)
+ goto error;
-mount_fail_check:
- /* on error free sesinfo and tcon struct if needed */
- if (rc) {
- /* If find_unc succeeded then rc == 0 so we can not end */
- /* up accidentally freeing someone elses tcon struct */
- if (tcon)
- cifs_put_tcon(tcon);
- else if (ses)
- cifs_put_smb_ses(ses);
- else
- cifs_put_tcp_session(server, 0);
+ if (tcon) {
+ rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+ if (rc == -EREMOTE)
+ rc = -EOPNOTSUPP;
+ if (rc)
+ goto error;
}
-out:
free_xid(xid);
+
+ return mount_setup_tlink(cifs_sb, ses, tcon);
+
+error:
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
return rc;
}
+#endif
/*
* Issue a TREE_CONNECT request.
@@ -4367,6 +4882,10 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_del_vol(cifs_sb->origin_fullpath);
+ kfree(cifs_sb->origin_fullpath);
+#endif
call_rcu(&cifs_sb->rcu, delayed_free);
}
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
new file mode 100644
index 000000000000..09b7d0d4f6e4
--- /dev/null
+++ b/fs/cifs/dfs_cache.c
@@ -0,0 +1,1368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DFS referral cache routines
+ *
+ * Copyright (c) 2018 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+#include <linux/nls.h>
+#include <linux/workqueue.h>
+#include "cifsglob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2glob.h"
+
+#include "dfs_cache.h"
+
+#define DFS_CACHE_HTABLE_SIZE 32
+#define DFS_CACHE_MAX_ENTRIES 64
+
+#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \
+ DFSREF_STORAGE_SERVER))
+
+struct dfs_cache_tgt {
+ char *t_name;
+ struct list_head t_list;
+};
+
+struct dfs_cache_entry {
+ struct hlist_node ce_hlist;
+ const char *ce_path;
+ int ce_ttl;
+ int ce_srvtype;
+ int ce_flags;
+ struct timespec64 ce_etime;
+ int ce_path_consumed;
+ int ce_numtgts;
+ struct list_head ce_tlist;
+ struct dfs_cache_tgt *ce_tgthint;
+ struct rcu_head ce_rcu;
+};
+
+static struct kmem_cache *dfs_cache_slab __read_mostly;
+
+struct dfs_cache_vol_info {
+ char *vi_fullpath;
+ struct smb_vol vi_vol;
+ struct list_head vi_list;
+};
+
+struct dfs_cache {
+ struct mutex dc_lock;
+ struct nls_table *dc_nlsc;
+ struct list_head dc_vol_list;
+ int dc_ttl;
+ struct delayed_work dc_refresh;
+};
+
+static struct dfs_cache dfs_cache;
+
+/*
+ * Number of entries in the cache
+ */
+static size_t dfs_cache_count;
+
+static DEFINE_MUTEX(dfs_cache_list_lock);
+static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE];
+
+static void refresh_cache_worker(struct work_struct *work);
+
+static inline bool is_path_valid(const char *path)
+{
+ return path && (strchr(path + 1, '\\') || strchr(path + 1, '/'));
+}
+
+static inline int get_normalized_path(const char *path, char **npath)
+{
+ if (*path == '\\') {
+ *npath = (char *)path;
+ } else {
+ *npath = kstrndup(path, strlen(path), GFP_KERNEL);
+ if (!*npath)
+ return -ENOMEM;
+ convert_delimiter(*npath, '\\');
+ }
+ return 0;
+}
+
+static inline void free_normalized_path(const char *path, char *npath)
+{
+ if (path != npath)
+ kfree(npath);
+}
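
An illustration of the normalization contract above (hypothetical values,
not part of the patch): a POSIX-style path is duplicated and converted,
while a path already in UNC form is returned untouched, which is exactly
what free_normalized_path() keys on.

	static void example_normalize(void)
	{
		char *npath;

		/* POSIX-style input: heap copy, '/' converted to '\' */
		if (!get_normalized_path("/srv/share/dir", &npath)) {
			/* npath now reads "\srv\share\dir" */
			free_normalized_path("/srv/share/dir", npath);
		}

		/* UNC-style input: npath == path, so nothing is freed */
		if (!get_normalized_path("\\srv\\share\\dir", &npath))
			free_normalized_path("\\srv\\share\\dir", npath);
	}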
+
+static inline bool cache_entry_expired(const struct dfs_cache_entry *ce)
+{
+ struct timespec64 ts;
+
+ ktime_get_coarse_real_ts64(&ts);
+ return timespec64_compare(&ts, &ce->ce_etime) >= 0;
+}
+
+static inline void free_tgts(struct dfs_cache_entry *ce)
+{
+ struct dfs_cache_tgt *t, *n;
+
+ list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) {
+ list_del(&t->t_list);
+ kfree(t->t_name);
+ kfree(t);
+ }
+}
+
+static void free_cache_entry(struct rcu_head *rcu)
+{
+ struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry,
+ ce_rcu);
+ kmem_cache_free(dfs_cache_slab, ce);
+}
+
+static inline void flush_cache_ent(struct dfs_cache_entry *ce)
+{
+ if (hlist_unhashed(&ce->ce_hlist))
+ return;
+
+ hlist_del_init_rcu(&ce->ce_hlist);
+ kfree(ce->ce_path);
+ free_tgts(ce);
+ dfs_cache_count--;
+ call_rcu(&ce->ce_rcu, free_cache_entry);
+}
+
+static void flush_cache_ents(void)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &dfs_cache_htable[i];
+ struct dfs_cache_entry *ce;
+
+ hlist_for_each_entry_rcu(ce, l, ce_hlist)
+ flush_cache_ent(ce);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * dfs cache /proc file
+ */
+static int dfscache_proc_show(struct seq_file *m, void *v)
+{
+ int bucket;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_tgt *t;
+
+ seq_puts(m, "DFS cache\n---------\n");
+
+ mutex_lock(&dfs_cache_list_lock);
+
+ rcu_read_lock();
+ hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+ seq_printf(m,
+ "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+ "interlink=%s,path_consumed=%d,expired=%s\n",
+ ce->ce_path,
+ ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link",
+ ce->ce_ttl, ce->ce_etime.tv_nsec,
+ IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
+ ce->ce_path_consumed,
+ cache_entry_expired(ce) ? "yes" : "no");
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ seq_printf(m, " %s%s\n",
+ t->t_name,
+ ce->ce_tgthint == t ? " (target hint)" : "");
+ }
+
+ }
+ rcu_read_unlock();
+
+ mutex_unlock(&dfs_cache_list_lock);
+ return 0;
+}
+
+static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char c;
+ int rc;
+
+ rc = get_user(c, buffer);
+ if (rc)
+ return rc;
+
+ if (c != '0')
+ return -EINVAL;
+
+ cifs_dbg(FYI, "clearing dfs cache\n");
+ mutex_lock(&dfs_cache_list_lock);
+ flush_cache_ents();
+ mutex_unlock(&dfs_cache_list_lock);
+
+ return count;
+}
+
+static int dfscache_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dfscache_proc_show, NULL);
+}
+
+const struct file_operations dfscache_proc_fops = {
+ .open = dfscache_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = dfscache_proc_write,
+};
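
Assuming these fops get registered as dfscache under /proc/fs/cifs (done
elsewhere in this series; the exact path is an assumption here), the write
handler above accepts only the single character '0'. A userspace sketch
that flushes the cache:

	#include <fcntl.h>
	#include <unistd.h>

	/* hypothetical helper: flush the kernel's DFS referral cache */
	static int flush_dfs_cache(void)
	{
		int fd = open("/proc/fs/cifs/dfscache", O_WRONLY);

		if (fd < 0)
			return -1;
		/* anything other than '0' gets -EINVAL back */
		if (write(fd, "0", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}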
+
+#ifdef CONFIG_CIFS_DEBUG2
+static inline void dump_tgts(const struct dfs_cache_entry *ce)
+{
+ struct dfs_cache_tgt *t;
+
+ cifs_dbg(FYI, "target list:\n");
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ cifs_dbg(FYI, " %s%s\n", t->t_name,
+ ce->ce_tgthint == t ? " (target hint)" : "");
+ }
+}
+
+static inline void dump_ce(const struct dfs_cache_entry *ce)
+{
+ cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+ "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path,
+ ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl,
+ ce->ce_etime.tv_nsec,
+ IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
+ ce->ce_path_consumed,
+ cache_entry_expired(ce) ? "yes" : "no");
+ dump_tgts(ce);
+}
+
+static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs)
+{
+ int i;
+
+ cifs_dbg(FYI, "DFS referrals returned by the server:\n");
+ for (i = 0; i < numrefs; i++) {
+ const struct dfs_info3_param *ref = &refs[i];
+
+ cifs_dbg(FYI,
+ "\n"
+ "flags: 0x%x\n"
+ "path_consumed: %d\n"
+ "server_type: 0x%x\n"
+ "ref_flag: 0x%x\n"
+ "path_name: %s\n"
+ "node_name: %s\n"
+ "ttl: %d (%dm)\n",
+ ref->flags, ref->path_consumed, ref->server_type,
+ ref->ref_flag, ref->path_name, ref->node_name,
+ ref->ttl, ref->ttl / 60);
+ }
+}
+#else
+#define dump_tgts(e)
+#define dump_ce(e)
+#define dump_refs(r, n)
+#endif
+
+/**
+ * dfs_cache_init - Initialize DFS referral cache.
+ *
+ * Return zero if initialized successfully, otherwise non-zero.
+ */
+int dfs_cache_init(void)
+{
+ int i;
+
+ dfs_cache_slab = kmem_cache_create("cifs_dfs_cache",
+ sizeof(struct dfs_cache_entry), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!dfs_cache_slab)
+ return -ENOMEM;
+
+ for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&dfs_cache_htable[i]);
+
+ INIT_LIST_HEAD(&dfs_cache.dc_vol_list);
+ mutex_init(&dfs_cache.dc_lock);
+ INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker);
+ dfs_cache.dc_ttl = -1;
+ dfs_cache.dc_nlsc = load_nls_default();
+
+ cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__);
+ return 0;
+}
+
+static inline unsigned int cache_entry_hash(const void *data, int size)
+{
+ unsigned int h;
+
+ h = jhash(data, size, 0);
+ return h & (DFS_CACHE_HTABLE_SIZE - 1);
+}
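
Since the hash is reduced with a mask rather than a modulo, this only
distributes correctly while DFS_CACHE_HTABLE_SIZE stays a power of two; a
defensive guard one could place at the top of dfs_cache_init() (a sketch,
not part of the patch):

	BUILD_BUG_ON_MSG(DFS_CACHE_HTABLE_SIZE & (DFS_CACHE_HTABLE_SIZE - 1),
			 "DFS_CACHE_HTABLE_SIZE must be a power of two");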
+
+/* Check whether second path component of @path is SYSVOL or NETLOGON */
+static inline bool is_sysvol_or_netlogon(const char *path)
+{
+ const char *s;
+ char sep = path[0];
+
+ s = strchr(path + 1, sep) + 1;
+ return !strncasecmp(s, "sysvol", strlen("sysvol")) ||
+ !strncasecmp(s, "netlogon", strlen("netlogon"));
+}
+
+/* Return target hint of a DFS cache entry */
+static inline char *get_tgt_name(const struct dfs_cache_entry *ce)
+{
+ struct dfs_cache_tgt *t = ce->ce_tgthint;
+
+ return t ? t->t_name : ERR_PTR(-ENOENT);
+}
+
+/* Return expire time out of a new entry's TTL */
+static inline struct timespec64 get_expire_time(int ttl)
+{
+ struct timespec64 ts = {
+ .tv_sec = ttl,
+ .tv_nsec = 0,
+ };
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ return timespec64_add(now, ts);
+}
+
+/* Allocate a new DFS target */
+static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
+{
+ struct dfs_cache_tgt *t;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+ t->t_name = kstrndup(name, strlen(name), GFP_KERNEL);
+ if (!t->t_name) {
+ kfree(t);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&t->t_list);
+ return t;
+}
+
+/*
+ * Copy DFS referral information to a cache entry and conditionally update
+ * target hint.
+ */
+static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
+ struct dfs_cache_entry *ce, const char *tgthint)
+{
+ int i;
+
+ ce->ce_ttl = refs[0].ttl;
+ ce->ce_etime = get_expire_time(ce->ce_ttl);
+ ce->ce_srvtype = refs[0].server_type;
+ ce->ce_flags = refs[0].ref_flag;
+ ce->ce_path_consumed = refs[0].path_consumed;
+
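+ /*
+ * When the previous hint is passed in and matched, it is re-inserted
+ * at the head of the list so that it is re-elected as ce_tgthint
+ * below; all other targets keep the referral order at the tail.
+ */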
+ for (i = 0; i < numrefs; i++) {
+ struct dfs_cache_tgt *t;
+
+ t = alloc_tgt(refs[i].node_name);
+ if (IS_ERR(t)) {
+ free_tgts(ce);
+ return PTR_ERR(t);
+ }
+ if (tgthint && !strcasecmp(t->t_name, tgthint)) {
+ list_add(&t->t_list, &ce->ce_tlist);
+ tgthint = NULL;
+ } else {
+ list_add_tail(&t->t_list, &ce->ce_tlist);
+ }
+ ce->ce_numtgts++;
+ }
+
+ ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist,
+ struct dfs_cache_tgt, t_list);
+
+ return 0;
+}
+
+/* Allocate a new cache entry */
+static struct dfs_cache_entry *
+alloc_cache_entry(const char *path, const struct dfs_info3_param *refs,
+ int numrefs)
+{
+ struct dfs_cache_entry *ce;
+ int rc;
+
+ ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL);
+ if (!ce)
+ return ERR_PTR(-ENOMEM);
+
+ ce->ce_path = kstrdup_const(path, GFP_KERNEL);
+ if (!ce->ce_path) {
+ kmem_cache_free(dfs_cache_slab, ce);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_HLIST_NODE(&ce->ce_hlist);
+ INIT_LIST_HEAD(&ce->ce_tlist);
+
+ rc = copy_ref_data(refs, numrefs, ce, NULL);
+ if (rc) {
+ kfree(ce->ce_path);
+ kmem_cache_free(dfs_cache_slab, ce);
+ ce = ERR_PTR(rc);
+ }
+ return ce;
+}
+
+static void remove_oldest_entry(void)
+{
+ int bucket;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_entry *to_del = NULL;
+
+ rcu_read_lock();
+ hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+ if (!to_del || timespec64_compare(&ce->ce_etime,
+ &to_del->ce_etime) < 0)
+ to_del = ce;
+ }
+ if (!to_del) {
+ cifs_dbg(FYI, "%s: no entry to remove\n", __func__);
+ goto out;
+ }
+ cifs_dbg(FYI, "%s: removing entry\n", __func__);
+ dump_ce(to_del);
+ flush_cache_ent(to_del);
+out:
+ rcu_read_unlock();
+}
+
+/* Add a new DFS cache entry */
+static inline struct dfs_cache_entry *
+add_cache_entry(unsigned int hash, const char *path,
+ const struct dfs_info3_param *refs, int numrefs)
+{
+ struct dfs_cache_entry *ce;
+
+ ce = alloc_cache_entry(path, refs, numrefs);
+ if (IS_ERR(ce))
+ return ce;
+
+ hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]);
+
+ mutex_lock(&dfs_cache.dc_lock);
+ if (dfs_cache.dc_ttl < 0) {
+ dfs_cache.dc_ttl = ce->ce_ttl;
+ queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
+ dfs_cache.dc_ttl * HZ);
+ } else {
+ dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl);
+ mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
+ dfs_cache.dc_ttl * HZ);
+ }
+ mutex_unlock(&dfs_cache.dc_lock);
+
+ return ce;
+}
+
+static struct dfs_cache_entry *__find_cache_entry(unsigned int hash,
+ const char *path)
+{
+ struct dfs_cache_entry *ce;
+ bool found = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) {
+ if (!strcasecmp(path, ce->ce_path)) {
+#ifdef CONFIG_CIFS_DEBUG2
+ char *name = get_tgt_name(ce);
+
+ if (unlikely(IS_ERR(name))) {
+ rcu_read_unlock();
+ return ERR_CAST(name);
+ }
+ cifs_dbg(FYI, "%s: cache hit\n", __func__);
+ cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name);
+#endif
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return found ? ce : ERR_PTR(-ENOENT);
+}
+
+/*
+ * Find a DFS cache entry in hash table and optionally check prefix path against
+ * @path.
+ * Use whole path components in the match.
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
+ */
+static inline struct dfs_cache_entry *find_cache_entry(const char *path,
+ unsigned int *hash)
+{
+ *hash = cache_entry_hash(path, strlen(path));
+ return __find_cache_entry(*hash, path);
+}
+
+static inline void destroy_slab_cache(void)
+{
+ rcu_barrier();
+ kmem_cache_destroy(dfs_cache_slab);
+}
+
+static inline void free_vol(struct dfs_cache_vol_info *vi)
+{
+ list_del(&vi->vi_list);
+ kfree(vi->vi_fullpath);
+ cifs_cleanup_volume_info_contents(&vi->vi_vol);
+ kfree(vi);
+}
+
+static inline void free_vol_list(void)
+{
+ struct dfs_cache_vol_info *vi, *nvi;
+
+ list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list)
+ free_vol(vi);
+}
+
+/**
+ * dfs_cache_destroy - destroy DFS referral cache
+ */
+void dfs_cache_destroy(void)
+{
+ cancel_delayed_work_sync(&dfs_cache.dc_refresh);
+ unload_nls(dfs_cache.dc_nlsc);
+ free_vol_list();
+ mutex_destroy(&dfs_cache.dc_lock);
+
+ flush_cache_ents();
+ destroy_slab_cache();
+ mutex_destroy(&dfs_cache_list_lock);
+
+ cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__);
+}
+
+static inline struct dfs_cache_entry *
+__update_cache_entry(const char *path, const struct dfs_info3_param *refs,
+ int numrefs)
+{
+ int rc;
+ unsigned int h;
+ struct dfs_cache_entry *ce;
+ char *s, *th = NULL;
+
+ ce = find_cache_entry(path, &h);
+ if (IS_ERR(ce))
+ return ce;
+
+ if (ce->ce_tgthint) {
+ s = ce->ce_tgthint->t_name;
+ th = kstrndup(s, strlen(s), GFP_KERNEL);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ free_tgts(ce);
+ ce->ce_numtgts = 0;
+
+ rc = copy_ref_data(refs, numrefs, ce, th);
+ kfree(th);
+
+ if (rc)
+ ce = ERR_PTR(rc);
+
+ return ce;
+}
+
+/* Update an expired cache entry by getting a new DFS referral from server */
+static struct dfs_cache_entry *
+update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, struct dfs_cache_entry *ce)
+{
+ int rc;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+
+ cifs_dbg(FYI, "%s: update expired cache entry\n", __func__);
+ /*
+ * Check if caller provided enough parameters to update an expired
+ * entry.
+ */
+ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer)
+ return ERR_PTR(-ETIME);
+ if (unlikely(!nls_codepage))
+ return ERR_PTR(-ETIME);
+
+ cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path);
+
+ rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs,
+ nls_codepage, remap);
+ if (rc)
+ ce = ERR_PTR(rc);
+ else
+ ce = __update_cache_entry(path, refs, numrefs);
+
+ dump_refs(refs, numrefs);
+ free_dfs_info_array(refs, numrefs);
+
+ return ce;
+}
+
+/*
+ * Find, create or update a DFS cache entry.
+ *
+ * If the entry wasn't found, it will create a new one. Or if it was found but
+ * expired, then it will update the entry accordingly.
+ *
+ * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to
+ * handle them properly.
+ */
+static struct dfs_cache_entry *
+do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, bool noreq)
+{
+ int rc;
+ unsigned int h;
+ struct dfs_cache_entry *ce;
+ struct dfs_info3_param *nrefs;
+ int numnrefs;
+
+ cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
+
+ ce = find_cache_entry(path, &h);
+ if (IS_ERR(ce)) {
+ cifs_dbg(FYI, "%s: cache miss\n", __func__);
+ /*
+ * If @noreq is set, no requests will be sent to the server for
+ * either updating or getting a new DFS referral.
+ */
+ if (noreq)
+ return ce;
+ /*
+ * No cache entry was found, so check for valid parameters that
+ * will be required to get a new DFS referral and then create a
+ * new cache entry.
+ */
+ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) {
+ ce = ERR_PTR(-EOPNOTSUPP);
+ return ce;
+ }
+ if (unlikely(!nls_codepage)) {
+ ce = ERR_PTR(-EINVAL);
+ return ce;
+ }
+
+ nrefs = NULL;
+ numnrefs = 0;
+
+ cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__,
+ path);
+
+ rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs,
+ &numnrefs, nls_codepage,
+ remap);
+ if (rc) {
+ ce = ERR_PTR(rc);
+ return ce;
+ }
+
+ dump_refs(nrefs, numnrefs);
+
+ cifs_dbg(FYI, "%s: new cache entry\n", __func__);
+
+ if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) {
+ cifs_dbg(FYI, "%s: reached max cache size (%d)\n",
+ __func__, DFS_CACHE_MAX_ENTRIES);
+ remove_oldest_entry();
+ }
+ ce = add_cache_entry(h, path, nrefs, numnrefs);
+ free_dfs_info_array(nrefs, numnrefs);
+
+ if (IS_ERR(ce))
+ return ce;
+
+ dfs_cache_count++;
+ }
+
+ dump_ce(ce);
+
+ /* Just return the found cache entry in case @noreq is set */
+ if (noreq)
+ return ce;
+
+ if (cache_entry_expired(ce)) {
+ cifs_dbg(FYI, "%s: expired cache entry\n", __func__);
+ ce = update_cache_entry(xid, ses, nls_codepage, remap, path,
+ ce);
+ if (IS_ERR(ce)) {
+ cifs_dbg(FYI, "%s: failed to update expired entry\n",
+ __func__);
+ }
+ }
+ return ce;
+}
+
+/* Set up a new DFS referral from a given cache entry */
+static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
+ struct dfs_info3_param *ref, const char *tgt)
+{
+ int rc;
+
+ cifs_dbg(FYI, "%s: set up new ref\n", __func__);
+
+ memset(ref, 0, sizeof(*ref));
+
+ ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL);
+ if (!ref->path_name)
+ return -ENOMEM;
+
+ ref->path_consumed = ce->ce_path_consumed;
+
+ ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL);
+ if (!ref->node_name) {
+ rc = -ENOMEM;
+ goto err_free_path;
+ }
+
+ ref->ttl = ce->ce_ttl;
+ ref->server_type = ce->ce_srvtype;
+ ref->ref_flag = ce->ce_flags;
+
+ return 0;
+
+err_free_path:
+ kfree(ref->path_name);
+ ref->path_name = NULL;
+ return rc;
+}
+
+/* Return target list of a DFS cache entry */
+static int get_tgt_list(const struct dfs_cache_entry *ce,
+ struct dfs_cache_tgt_list *tl)
+{
+ int rc;
+ struct list_head *head = &tl->tl_list;
+ struct dfs_cache_tgt *t;
+ struct dfs_cache_tgt_iterator *it, *nit;
+
+ memset(tl, 0, sizeof(*tl));
+ INIT_LIST_HEAD(head);
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ it = kzalloc(sizeof(*it), GFP_KERNEL);
+ if (!it) {
+ rc = -ENOMEM;
+ goto err_free_it;
+ }
+
+ it->it_name = kstrndup(t->t_name, strlen(t->t_name),
+ GFP_KERNEL);
+ if (!it->it_name) {
+ kfree(it);
+ rc = -ENOMEM;
+ goto err_free_it;
+ }
+
+ if (ce->ce_tgthint == t)
+ list_add(&it->it_list, head);
+ else
+ list_add_tail(&it->it_list, head);
+ }
+ tl->tl_numtgts = ce->ce_numtgts;
+
+ return 0;
+
+err_free_it:
+ list_for_each_entry_safe(it, nit, head, it_list) {
+ kfree(it->it_name);
+ kfree(it);
+ }
+ return rc;
+}
+
+/**
+ * dfs_cache_find - find a DFS cache entry
+ *
+ * If it doesn't find the cache entry, then it will get a DFS referral
+ * for @path and create a new entry.
+ *
+ * In case the cache entry exists but expired, it will get a DFS referral
+ * for @path and then update the respective cache entry.
+ *
+ * These parameters are passed down to the get_dfs_refer() call if it
+ * needs to be issued:
+ * @xid: syscall xid
+ * @ses: smb session to issue the request on
+ * @nls_codepage: charset conversion
+ * @remap: path character remapping type
+ * @path: path to lookup in DFS referral cache.
+ *
+ * @ref: when non-NULL, store single DFS referral result in it.
+ * @tgt_list: when non-NULL, store complete DFS target list in it.
+ *
+ * Return zero if the target was found, otherwise non-zero.
+ */
+int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ if (!IS_ERR(ce)) {
+ if (ref)
+ rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+ else
+ rc = 0;
+ if (!rc && tgt_list)
+ rc = get_tgt_list(ce, tgt_list);
+ } else {
+ rc = PTR_ERR(ce);
+ }
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
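
A hypothetical caller of dfs_cache_find(), shown for illustration only:
resolve a DFS path into a single referral for the current target hint,
then release the strings duplicated by setup_ref().

	static int example_resolve(const unsigned int xid, struct cifs_ses *ses,
				   struct cifs_sb_info *cifs_sb, const char *path)
	{
		struct dfs_info3_param ref = {0};
		int rc;

		rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
				    cifs_remap(cifs_sb), path, &ref, NULL);
		if (rc)
			return rc;

		cifs_dbg(FYI, "target hint: %s (path_consumed=%d)\n",
			 ref.node_name, ref.path_consumed);
		free_dfs_info_param(&ref);
		return 0;
	}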
+
+/**
+ * dfs_cache_noreq_find - find a DFS cache entry without sending any requests to
+ * the currently connected server.
+ *
+ * NOTE: This function will neither update a cache entry in case it was
+ * expired, nor create a new cache entry if @path hasn't been found. It heavily
+ * relies on an existing cache entry.
+ *
+ * @path: path to lookup in the DFS referral cache.
+ * @ref: when non-NULL, store single DFS referral result in it.
+ * @tgt_list: when non-NULL, store complete DFS target list in it.
+ *
+ * Return 0 if successful.
+ * Return -ENOENT if the entry was not found.
+ * Return non-zero for other errors.
+ */
+int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ if (ref)
+ rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+ else
+ rc = 0;
+ if (!rc && tgt_list)
+ rc = get_tgt_list(ce, tgt_list);
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_update_tgthint - update target hint of a DFS cache entry
+ *
+ * If it doesn't find the cache entry, then it will get a DFS referral for @path
+ * and create a new entry.
+ *
+ * In case the cache entry exists but expired, it will get a DFS referral
+ * for @path and then update the respective cache entry.
+ *
+ * @xid: syscall id
+ * @ses: smb session
+ * @nls_codepage: charset conversion
+ * @remap: type of character remapping for paths
+ * @path: path to lookup in DFS referral cache.
+ * @it: DFS target iterator
+ *
+ * Return zero if the target hint was updated successfully, otherwise non-zero.
+ */
+int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path,
+ const struct dfs_cache_tgt_iterator *it)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_tgt *t;
+
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ rc = 0;
+
+ t = ce->ce_tgthint;
+
+ if (likely(!strcasecmp(it->it_name, t->t_name)))
+ goto out;
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ if (!strcasecmp(t->t_name, it->it_name)) {
+ ce->ce_tgthint = t;
+ cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
+ it->it_name);
+ break;
+ }
+ }
+
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_noreq_update_tgthint - update target hint of a DFS cache entry
+ * without sending any requests to the currently connected server.
+ *
+ * NOTE: This function will neither update a cache entry in case it was
+ * expired, nor create a new cache entry if @path hasn't been found. It heavily
+ * relies on an existing cache entry.
+ *
+ * @path: path to lookup in DFS referral cache.
+ * @it: target iterator which contains the target hint to update the cache
+ * entry with.
+ *
+ * Return zero if the target hint was updated successfully, otherwise non-zero.
+ */
+int dfs_cache_noreq_update_tgthint(const char *path,
+ const struct dfs_cache_tgt_iterator *it)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_tgt *t;
+
+ if (unlikely(!is_path_valid(path)) || !it)
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ mutex_lock(&dfs_cache_list_lock);
+
+ ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ rc = 0;
+
+ t = ce->ce_tgthint;
+
+ if (unlikely(!strcasecmp(it->it_name, t->t_name)))
+ goto out;
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ if (!strcasecmp(t->t_name, it->it_name)) {
+ ce->ce_tgthint = t;
+ cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
+ it->it_name);
+ break;
+ }
+ }
+
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given
+ * target iterator (@it).
+ *
+ * @path: path to lookup in DFS referral cache.
+ * @it: DFS target iterator.
+ * @ref: DFS referral pointer to set up the gathered information.
+ *
+ * Return zero if the DFS referral was set up correctly, otherwise non-zero.
+ */
+int dfs_cache_get_tgt_referral(const char *path,
+ const struct dfs_cache_tgt_iterator *it,
+ struct dfs_info3_param *ref)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+ unsigned int h;
+
+ if (!it || !ref)
+ return -EINVAL;
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ mutex_lock(&dfs_cache_list_lock);
+
+ ce = find_cache_entry(npath, &h);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name);
+
+ rc = setup_ref(path, ce, ref, it->it_name);
+
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
+{
+ memcpy(new, vol, sizeof(*new));
+
+ if (vol->username) {
+ new->username = kstrndup(vol->username, strlen(vol->username),
+ GFP_KERNEL);
+ if (!new->username)
+ return -ENOMEM;
+ }
+ if (vol->password) {
+ new->password = kstrndup(vol->password, strlen(vol->password),
+ GFP_KERNEL);
+ if (!new->password)
+ goto err_free_username;
+ }
+ if (vol->UNC) {
+ cifs_dbg(FYI, "%s: vol->UNC: %s\n", __func__, vol->UNC);
+ new->UNC = kstrndup(vol->UNC, strlen(vol->UNC), GFP_KERNEL);
+ if (!new->UNC)
+ goto err_free_password;
+ }
+ if (vol->domainname) {
+ new->domainname = kstrndup(vol->domainname,
+ strlen(vol->domainname), GFP_KERNEL);
+ if (!new->domainname)
+ goto err_free_unc;
+ }
+ if (vol->iocharset) {
+ new->iocharset = kstrndup(vol->iocharset,
+ strlen(vol->iocharset), GFP_KERNEL);
+ if (!new->iocharset)
+ goto err_free_domainname;
+ }
+ if (vol->prepath) {
+ cifs_dbg(FYI, "%s: vol->prepath: %s\n", __func__, vol->prepath);
+ new->prepath = kstrndup(vol->prepath, strlen(vol->prepath),
+ GFP_KERNEL);
+ if (!new->prepath)
+ goto err_free_iocharset;
+ }
+
+ return 0;
+
+err_free_iocharset:
+ kfree(new->iocharset);
+err_free_domainname:
+ kfree(new->domainname);
+err_free_unc:
+ kfree(new->UNC);
+err_free_password:
+ kzfree(new->password);
+err_free_username:
+ kfree(new->username);
+ /* @new is embedded in the caller's struct; it must not be freed here */
+ return -ENOMEM;
+}
+
+/**
+ * dfs_cache_add_vol - add a cifs volume during mount() that will be handled
+ * by the DFS cache refresh worker.
+ *
+ * @vol: cifs volume.
+ * @fullpath: origin full path.
+ *
+ * Return zero if volume was set up correctly, otherwise non-zero.
+ */
+int dfs_cache_add_vol(struct smb_vol *vol, const char *fullpath)
+{
+ int rc;
+ struct dfs_cache_vol_info *vi;
+
+ if (!vol || !fullpath)
+ return -EINVAL;
+
+ cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+ vi = kzalloc(sizeof(*vi), GFP_KERNEL);
+ if (!vi)
+ return -ENOMEM;
+
+ vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ if (!vi->vi_fullpath) {
+ rc = -ENOMEM;
+ goto err_free_vi;
+ }
+
+ rc = dup_vol(vol, &vi->vi_vol);
+ if (rc)
+ goto err_free_fullpath;
+
+ mutex_lock(&dfs_cache.dc_lock);
+ list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list);
+ mutex_unlock(&dfs_cache.dc_lock);
+ return 0;
+
+err_free_fullpath:
+ kfree(vi->vi_fullpath);
+err_free_vi:
+ kfree(vi);
+ return rc;
+}
+
+static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
+{
+ struct dfs_cache_vol_info *vi;
+
+ list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) {
+ cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__,
+ vi->vi_fullpath);
+ if (!strcasecmp(vi->vi_fullpath, fullpath))
+ return vi;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+/**
+ * dfs_cache_update_vol - update vol info in DFS cache after failover
+ *
+ * @fullpath: fullpath to look up in volume list.
+ * @server: TCP server pointer.
+ *
+ * Return zero if volume was updated, otherwise non-zero.
+ */
+int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server)
+{
+ int rc;
+ struct dfs_cache_vol_info *vi;
+
+ if (!fullpath || !server)
+ return -EINVAL;
+
+ cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+ mutex_lock(&dfs_cache.dc_lock);
+
+ vi = find_vol(fullpath);
+ if (IS_ERR(vi)) {
+ rc = PTR_ERR(vi);
+ goto out;
+ }
+
+ cifs_dbg(FYI, "%s: updating volume info\n", __func__);
+ memcpy(&vi->vi_vol.dstaddr, &server->dstaddr,
+ sizeof(vi->vi_vol.dstaddr));
+ rc = 0;
+
+out:
+ mutex_unlock(&dfs_cache.dc_lock);
+ return rc;
+}
+
+/**
+ * dfs_cache_del_vol - remove volume info in DFS cache during umount()
+ *
+ * @fullpath: fullpath to look up in volume list.
+ */
+void dfs_cache_del_vol(const char *fullpath)
+{
+ struct dfs_cache_vol_info *vi;
+
+ if (!fullpath || !*fullpath)
+ return;
+
+ cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+ mutex_lock(&dfs_cache.dc_lock);
+ vi = find_vol(fullpath);
+ if (!IS_ERR(vi))
+ free_vol(vi);
+ mutex_unlock(&dfs_cache.dc_lock);
+}
+
+/* Get all tcons that are within a DFS namespace and can be refreshed */
+static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
+{
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ INIT_LIST_HEAD(head);
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (!tcon->need_reconnect && !tcon->need_reopen_files &&
+ tcon->dfs_path) {
+ tcon->tc_count++;
+ list_add_tail(&tcon->ulist, head);
+ }
+ }
+ if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect &&
+ ses->tcon_ipc->dfs_path) {
+ list_add_tail(&ses->tcon_ipc->ulist, head);
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+/* Refresh DFS cache entry from a given tcon */
+static void do_refresh_tcon(struct dfs_cache *dc, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+ unsigned int xid;
+ char *path, *npath;
+ unsigned int h;
+ struct dfs_cache_entry *ce;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+
+ xid = get_xid();
+
+ path = tcon->dfs_path + 1;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ goto out;
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = find_cache_entry(npath, &h);
+ mutex_unlock(&dfs_cache_list_lock);
+
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ if (!cache_entry_expired(ce))
+ goto out;
+
+ if (unlikely(!tcon->ses->server->ops->get_dfs_refer)) {
+ rc = -EOPNOTSUPP;
+ } else {
+ rc = tcon->ses->server->ops->get_dfs_refer(xid, tcon->ses, path,
+ &refs, &numrefs,
+ dc->dc_nlsc,
+ tcon->remap);
+ if (!rc) {
+ mutex_lock(&dfs_cache_list_lock);
+ ce = __update_cache_entry(npath, refs, numrefs);
+ mutex_unlock(&dfs_cache_list_lock);
+ dump_refs(refs, numrefs);
+ free_dfs_info_array(refs, numrefs);
+ if (IS_ERR(ce))
+ rc = PTR_ERR(ce);
+ }
+ }
+ if (rc)
+ cifs_dbg(FYI, "%s: failed to update expired entry\n", __func__);
+out:
+ free_xid(xid);
+ free_normalized_path(path, npath);
+}
+
+/*
+ * Worker that will refresh DFS cache based on lowest TTL value from a DFS
+ * referral.
+ *
+ * FIXME: ensure that all requests are sent to DFS root for refreshing the
+ * cache.
+ */
+static void refresh_cache_worker(struct work_struct *work)
+{
+ struct dfs_cache *dc = container_of(work, struct dfs_cache,
+ dc_refresh.work);
+ struct dfs_cache_vol_info *vi;
+ struct TCP_Server_Info *server;
+ LIST_HEAD(list);
+ struct cifs_tcon *tcon, *ntcon;
+
+ mutex_lock(&dc->dc_lock);
+
+ list_for_each_entry(vi, &dc->dc_vol_list, vi_list) {
+ server = cifs_find_tcp_session(&vi->vi_vol);
+ if (IS_ERR_OR_NULL(server))
+ continue;
+ if (server->tcpStatus != CifsGood)
+ goto next;
+ get_tcons(server, &list);
+ list_for_each_entry_safe(tcon, ntcon, &list, ulist) {
+ do_refresh_tcon(dc, tcon);
+ list_del_init(&tcon->ulist);
+ cifs_put_tcon(tcon);
+ }
+next:
+ cifs_put_tcp_session(server, 0);
+ }
+ queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ);
+ mutex_unlock(&dc->dc_lock);
+}
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
new file mode 100644
index 000000000000..22f366514f3a
--- /dev/null
+++ b/fs/cifs/dfs_cache.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DFS referral cache routines
+ *
+ * Copyright (c) 2018 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#ifndef _CIFS_DFS_CACHE_H
+#define _CIFS_DFS_CACHE_H
+
+#include <linux/nls.h>
+#include <linux/list.h>
+#include "cifsglob.h"
+
+struct dfs_cache_tgt_list {
+ int tl_numtgts;
+ struct list_head tl_list;
+};
+
+struct dfs_cache_tgt_iterator {
+ char *it_name;
+ struct list_head it_list;
+};
+
+extern int dfs_cache_init(void);
+extern void dfs_cache_destroy(void);
+extern const struct file_operations dfscache_proc_fops;
+
+extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list);
+extern int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list);
+extern int dfs_cache_update_tgthint(const unsigned int xid,
+ struct cifs_ses *ses,
+ const struct nls_table *nls_codepage,
+ int remap, const char *path,
+ const struct dfs_cache_tgt_iterator *it);
+extern int
+dfs_cache_noreq_update_tgthint(const char *path,
+ const struct dfs_cache_tgt_iterator *it);
+extern int dfs_cache_get_tgt_referral(const char *path,
+ const struct dfs_cache_tgt_iterator *it,
+ struct dfs_info3_param *ref);
+extern int dfs_cache_add_vol(struct smb_vol *vol, const char *fullpath);
+extern int dfs_cache_update_vol(const char *fullpath,
+ struct TCP_Server_Info *server);
+extern void dfs_cache_del_vol(const char *fullpath);
+
+static inline struct dfs_cache_tgt_iterator *
+dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
+ struct dfs_cache_tgt_iterator *it)
+{
+ if (!tl || list_empty(&tl->tl_list) || !it ||
+ list_is_last(&it->it_list, &tl->tl_list))
+ return NULL;
+ return list_next_entry(it, it_list);
+}
+
+static inline struct dfs_cache_tgt_iterator *
+dfs_cache_get_tgt_iterator(struct dfs_cache_tgt_list *tl)
+{
+ if (!tl)
+ return NULL;
+ return list_first_entry_or_null(&tl->tl_list,
+ struct dfs_cache_tgt_iterator,
+ it_list);
+}
+
+static inline void dfs_cache_free_tgts(struct dfs_cache_tgt_list *tl)
+{
+ struct dfs_cache_tgt_iterator *it, *nit;
+
+ if (!tl || list_empty(&tl->tl_list))
+ return;
+ list_for_each_entry_safe(it, nit, &tl->tl_list, it_list) {
+ list_del(&it->it_list);
+ kfree(it->it_name);
+ kfree(it);
+ }
+ tl->tl_numtgts = 0;
+}
+
+static inline const char *
+dfs_cache_get_tgt_name(const struct dfs_cache_tgt_iterator *it)
+{
+ return it ? it->it_name : NULL;
+}
+
+static inline int
+dfs_cache_get_nr_tgts(const struct dfs_cache_tgt_list *tl)
+{
+ return tl ? tl->tl_numtgts : 0;
+}
+
+#endif /* _CIFS_DFS_CACHE_H */
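
The iterator helpers above compose into a simple hint-first walk; a sketch
of a hypothetical consumer of a target list filled in by dfs_cache_find()
or dfs_cache_noreq_find():

	static void example_walk_targets(struct dfs_cache_tgt_list *tl)
	{
		struct dfs_cache_tgt_iterator *it;

		/* the current target hint is always first in the list */
		for (it = dfs_cache_get_tgt_iterator(tl); it;
		     it = dfs_cache_get_next_tgt(tl, it))
			pr_info("dfs target: %s\n", dfs_cache_get_tgt_name(it));

		dfs_cache_free_tgts(tl);	/* frees names and iterators */
	}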
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3713d22b95a7..907e85d65bb4 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -174,7 +174,7 @@ cifs_bp_rename_retry:
cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath);
memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1);
- full_path[dfsplen] = '\\';
+ full_path[dfsplen] = dirsep;
for (i = 0; i < pplen-1; i++)
if (full_path[dfsplen+1+i] == '/')
full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8d41ca7bfcf1..659ce1b92c44 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -33,6 +33,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/mm.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -334,6 +335,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
server->ops->set_fid(cfile, fid, oplock);
list_add(&cfile->tlist, &tcon->openFileList);
+ atomic_inc(&tcon->num_local_opens);
/* if readable file instance put first in list*/
if (file->f_mode & FMODE_READ)
@@ -395,6 +397,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
/* remove it from the lists */
list_del(&cifs_file->flist);
list_del(&cifs_file->tlist);
+ atomic_dec(&tcon->num_local_opens);
if (list_empty(&cifsi->openFileList)) {
cifs_dbg(FYI, "closing last open instance for inode %p\n",
@@ -730,7 +733,8 @@ reopen_success:
if (can_flush) {
rc = filemap_write_and_wait(inode->i_mapping);
- mapping_set_error(inode->i_mapping, rc);
+ if (!is_interrupt_error(rc))
+ mapping_set_error(inode->i_mapping, rc);
if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path,
@@ -864,7 +868,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
}
static struct cifsLockInfo *
-cifs_lock_init(__u64 offset, __u64 length, __u8 type)
+cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 flags)
{
struct cifsLockInfo *lock =
kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
@@ -874,6 +878,7 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type)
lock->length = length;
lock->type = type;
lock->pid = current->tgid;
+ lock->flags = flags;
INIT_LIST_HEAD(&lock->blist);
init_waitqueue_head(&lock->block_q);
return lock;
@@ -896,7 +901,8 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
/* @rw_check : 0 - no op, 1 - read, 2 - write */
static bool
cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
- __u64 length, __u8 type, struct cifsFileInfo *cfile,
+ __u64 length, __u8 type, __u16 flags,
+ struct cifsFileInfo *cfile,
struct cifsLockInfo **conf_lock, int rw_check)
{
struct cifsLockInfo *li;
@@ -918,6 +924,10 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
((server->ops->compare_fids(cfile, cur_cfile) &&
current->tgid == li->pid) || type == li->type))
continue;
+ if (rw_check == CIFS_LOCK_OP &&
+ (flags & FL_OFDLCK) && (li->flags & FL_OFDLCK) &&
+ server->ops->compare_fids(cfile, cur_cfile))
+ continue;
if (conf_lock)
*conf_lock = li;
return true;
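
The new FL_OFDLCK test above keeps two open-file-description (OFD) locks taken
through the same cifs fid from being reported as conflicting, matching the VFS
semantics of F_OFD_SETLK. A userspace sketch of the case it covers
(hypothetical path, error handling elided):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	struct flock fl = {
		.l_type	  = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len	  = 4096,
		.l_pid	  = 0,	/* must be 0 for OFD locks */
	};
	int fd = open("/mnt/cifs/testfile", O_RDWR);

	fcntl(fd, F_OFD_SETLK, &fl);	/* take the lock */
	fcntl(fd, F_OFD_SETLK, &fl);	/* same description: no conflict */
	close(fd);
	return 0;
}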
@@ -927,8 +937,8 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
bool
cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
- __u8 type, struct cifsLockInfo **conf_lock,
- int rw_check)
+ __u8 type, __u16 flags,
+ struct cifsLockInfo **conf_lock, int rw_check)
{
bool rc = false;
struct cifs_fid_locks *cur;
@@ -936,7 +946,8 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
list_for_each_entry(cur, &cinode->llist, llist) {
rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
- cfile, conf_lock, rw_check);
+ flags, cfile, conf_lock,
+ rw_check);
if (rc)
break;
}
@@ -964,7 +975,8 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
down_read(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, offset, length, type,
- &conf_lock, CIFS_LOCK_OP);
+ flock->fl_flags, &conf_lock,
+ CIFS_LOCK_OP);
if (exist) {
flock->fl_start = conf_lock->offset;
flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -995,7 +1007,7 @@ cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
* Set the byte-range lock (mandatory style). Returns:
* 1) 0, if we set the lock and don't need to request to the server;
* 2) 1, if no locks prevent us but we need to request to the server;
- * 3) -EACCESS, if there is a lock that prevents us and wait is false.
+ * 3) -EACCES, if there is a lock that prevents us and wait is false.
*/
static int
cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
@@ -1011,7 +1023,8 @@ try_again:
down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
- lock->type, &conf_lock, CIFS_LOCK_OP);
+ lock->type, lock->flags, &conf_lock,
+ CIFS_LOCK_OP);
if (!exist && cinode->can_cache_brlcks) {
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
@@ -1092,10 +1105,10 @@ try_again:
rc = posix_lock_file(file, flock, NULL);
up_write(&cinode->lock_sem);
if (rc == FILE_LOCK_DEFERRED) {
- rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
+ rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker);
if (!rc)
goto try_again;
- posix_unblock_lock(flock);
+ locks_delete_block(flock);
}
return rc;
}
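
The switch from waiting on fl_next to fl_blocker, and from
posix_unblock_lock() to locks_delete_block(), tracks the concurrent rework of
the file-locking core; the CIFS-side pattern is otherwise unchanged: sleep
until the lock is no longer blocked on another one, retry the posix lock, and
detach from the blocker if the wait was interrupted.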
@@ -1120,14 +1133,18 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf) {
+ if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) {
free_xid(xid);
return -EINVAL;
}
+ BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+ PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+ PAGE_SIZE);
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
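
maxBuf is negotiated with (and therefore controlled by) the server, so the
hunk above bounds it twice before sizing the kcalloc(): once against the
minimum that fits a header plus one lock element, and once against PAGE_SIZE.
The resulting arithmetic, pulled out as a sketch (note the header size ends up
subtracted twice, which only makes the bound more conservative):

/* Sketch of the element-count bound enforced above. */
static int lock_elems_for_buf(unsigned int max_buf)
{
	if (max_buf < sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))
		return -EINVAL;	/* too small to carry even one lock */
	max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
			PAGE_SIZE);
	return (max_buf - sizeof(struct smb_hdr)) /
					sizeof(LOCKING_ANDX_RANGE);
}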
@@ -1321,7 +1338,7 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
cifs_dbg(FYI, "Lease on file - not implemented yet\n");
if (flock->fl_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP |
- FL_ACCESS | FL_LEASE | FL_CLOSE)))
+ FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK)))
cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
*type = server->vals->large_lock_type;
@@ -1460,12 +1477,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf)
+ if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE)))
return -EINVAL;
+ BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+ PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+ PAGE_SIZE);
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
@@ -1584,7 +1605,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
if (lock) {
struct cifsLockInfo *lock;
- lock = cifs_lock_init(flock->fl_start, length, type);
+ lock = cifs_lock_init(flock->fl_start, length, type,
+ flock->fl_flags);
if (!lock)
return -ENOMEM;
@@ -1653,7 +1675,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
tcon->ses->server);
-
cifs_sb = CIFS_FILE_SB(file);
netfid = cfile->fid.netfid;
cinode = CIFS_I(file_inode(file));
@@ -2098,6 +2119,8 @@ static int cifs_writepages(struct address_space *mapping,
pgoff_t end, index;
struct cifs_writedata *wdata;
int rc = 0;
+ int saved_rc = 0;
+ unsigned int xid;
/*
* If wsize is smaller than the page cache size, default to writing
@@ -2106,6 +2129,7 @@ static int cifs_writepages(struct address_space *mapping,
if (cifs_sb->wsize < PAGE_SIZE)
return generic_writepages(mapping, wbc);
+ xid = get_xid();
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
@@ -2124,8 +2148,10 @@ retry:
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
&wsize, &credits);
- if (rc)
+ if (rc != 0) {
+ done = true;
break;
+ }
tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
@@ -2133,6 +2159,7 @@ retry:
&found_pages);
if (!wdata) {
rc = -ENOMEM;
+ done = true;
add_credits_and_wake_if(server, credits, 0);
break;
}
@@ -2161,7 +2188,7 @@ retry:
if (rc != 0) {
add_credits_and_wake_if(server, wdata->credits, 0);
for (i = 0; i < nr_pages; ++i) {
- if (rc == -EAGAIN)
+ if (is_retryable_error(rc))
redirty_page_for_writepage(wbc,
wdata->pages[i]);
else
@@ -2169,7 +2196,7 @@ retry:
end_page_writeback(wdata->pages[i]);
put_page(wdata->pages[i]);
}
- if (rc != -EAGAIN)
+ if (!is_retryable_error(rc))
mapping_set_error(mapping, rc);
}
kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2179,6 +2206,15 @@ retry:
continue;
}
+ /* Return immediately if we received a signal during writing */
+ if (is_interrupt_error(rc)) {
+ done = true;
+ break;
+ }
+
+ if (rc != 0 && saved_rc == 0)
+ saved_rc = rc;
+
wbc->nr_to_write -= nr_pages;
if (wbc->nr_to_write <= 0)
done = true;
@@ -2196,9 +2232,13 @@ retry:
goto retry;
}
+ if (saved_rc != 0)
+ rc = saved_rc;
+
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+ free_xid(xid);
return rc;
}
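
The is_interrupt_error()/is_retryable_error() predicates used throughout these
writeback hunks are added to cifsglob.h elsewhere in this series; roughly, as
a sketch:

/* Sketch of the error classes assumed above (cifsglob.h). */
static inline bool is_interrupt_error(int error)
{
	switch (error) {
	case -EINTR:
	case -ERESTARTSYS:
	case -ERESTARTNOHAND:
	case -ERESTARTNOINTR:
		return true;
	}
	return false;
}

static inline bool is_retryable_error(int error)
{
	/* interrupts and -EAGAIN are worth redirtying the page for */
	return is_interrupt_error(error) || error == -EAGAIN;
}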
@@ -2227,8 +2267,8 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
set_page_writeback(page);
retry_write:
rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
- if (rc == -EAGAIN) {
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (is_retryable_error(rc)) {
+ if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
goto retry_write;
redirty_page_for_writepage(wbc, page);
} else if (rc != 0) {
@@ -2524,6 +2564,54 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
}
static int
+cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
+ struct cifs_aio_ctx *ctx)
+{
+ unsigned int wsize, credits;
+ int rc;
+ struct TCP_Server_Info *server =
+ tlink_tcon(wdata->cfile->tlink)->ses->server;
+
+ /*
+ * Wait for credits to resend this wdata.
+ * Note: we are attempting to resend the whole wdata, not in segments.
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(
+ server, wdata->bytes, &wsize, &credits);
+
+ if (rc)
+ goto out;
+
+ if (wsize < wdata->bytes) {
+ add_credits_and_wake_if(server, credits, 0);
+ msleep(1000);
+ }
+ } while (wsize < wdata->bytes);
+
+ rc = -EAGAIN;
+ while (rc == -EAGAIN) {
+ rc = 0;
+ if (wdata->cfile->invalidHandle)
+ rc = cifs_reopen_file(wdata->cfile, false);
+ if (!rc)
+ rc = server->ops->async_writev(wdata,
+ cifs_uncached_writedata_release);
+ }
+
+ if (!rc) {
+ list_add_tail(&wdata->list, wdata_list);
+ return 0;
+ }
+
+ add_credits_and_wake_if(server, wdata->credits, 0);
+out:
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+
+ return rc;
+}
+
+static int
cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct cifsFileInfo *open_file,
struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
@@ -2537,6 +2625,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
loff_t saved_offset = offset;
pid_t pid;
struct TCP_Server_Info *server;
+ struct page **pagevec;
+ size_t start;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2553,38 +2643,86 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
if (rc)
break;
- nr_pages = get_numpages(wsize, len, &cur_len);
- wdata = cifs_writedata_alloc(nr_pages,
+ cur_len = min_t(const size_t, len, wsize);
+
+ if (ctx->direct_io) {
+ ssize_t result;
+
+ result = iov_iter_get_pages_alloc(
+ from, &pagevec, cur_len, &start);
+ if (result < 0) {
+ cifs_dbg(VFS,
+ "direct_writev couldn't get user pages "
+ "(rc=%zd) iter type %d iov_offset %zd "
+ "count %zd\n",
+ result, from->type,
+ from->iov_offset, from->count);
+ dump_stack();
+
+ rc = result;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
+ cur_len = (size_t)result;
+ iov_iter_advance(from, cur_len);
+
+ nr_pages =
+ (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ wdata = cifs_writedata_direct_alloc(pagevec,
cifs_uncached_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
+ if (!wdata) {
+ rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
- rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
- if (rc) {
- kfree(wdata);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
- num_pages = nr_pages;
- rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
- if (rc) {
- for (i = 0; i < nr_pages; i++)
- put_page(wdata->pages[i]);
- kfree(wdata);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
+ wdata->page_offset = start;
+ wdata->tailsz =
+ nr_pages > 1 ?
+ cur_len - (PAGE_SIZE - start) -
+ (nr_pages - 2) * PAGE_SIZE :
+ cur_len;
+ } else {
+ nr_pages = get_numpages(wsize, len, &cur_len);
+ wdata = cifs_writedata_alloc(nr_pages,
+ cifs_uncached_writev_complete);
+ if (!wdata) {
+ rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
- /*
- * Bring nr_pages down to the number of pages we actually used,
- * and free any pages that we didn't use.
- */
- for ( ; nr_pages > num_pages; nr_pages--)
- put_page(wdata->pages[nr_pages - 1]);
+ rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
+ if (rc) {
+ kvfree(wdata->pages);
+ kfree(wdata);
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
+
+ num_pages = nr_pages;
+ rc = wdata_fill_from_iovec(
+ wdata, from, &cur_len, &num_pages);
+ if (rc) {
+ for (i = 0; i < nr_pages; i++)
+ put_page(wdata->pages[i]);
+ kvfree(wdata->pages);
+ kfree(wdata);
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
+
+ /*
+ * Bring nr_pages down to the number of pages we
+ * actually used, and free any pages that we didn't use.
+ */
+ for ( ; nr_pages > num_pages; nr_pages--)
+ put_page(wdata->pages[nr_pages - 1]);
+
+ wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
+ }
wdata->sync_mode = WB_SYNC_ALL;
wdata->nr_pages = nr_pages;
@@ -2593,7 +2731,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pid = pid;
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
- wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
wdata->credits = credits;
wdata->ctx = ctx;
kref_get(&ctx->refcount);
@@ -2668,13 +2805,18 @@ restart_loop:
INIT_LIST_HEAD(&tmp_list);
list_del_init(&wdata->list);
- iov_iter_advance(&tmp_from,
+ if (ctx->direct_io)
+ rc = cifs_resend_wdata(
+ wdata, &tmp_list, ctx);
+ else {
+ iov_iter_advance(&tmp_from,
wdata->offset - ctx->pos);
- rc = cifs_write_from_iter(wdata->offset,
+ rc = cifs_write_from_iter(wdata->offset,
wdata->bytes, &tmp_from,
ctx->cfile, cifs_sb, &tmp_list,
ctx);
+ }
list_splice(&tmp_list, &ctx->list);
@@ -2687,8 +2829,9 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
- for (i = 0; i < ctx->npages; i++)
- put_page(ctx->bv[i].bv_page);
+ if (!ctx->direct_io)
+ for (i = 0; i < ctx->npages; i++)
+ put_page(ctx->bv[i].bv_page);
cifs_stats_bytes_written(tcon, ctx->total_len);
set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
@@ -2703,7 +2846,8 @@ restart_loop:
complete(&ctx->done);
}
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t __cifs_writev(
+ struct kiocb *iocb, struct iov_iter *from, bool direct)
{
struct file *file = iocb->ki_filp;
ssize_t total_written = 0;
@@ -2712,13 +2856,18 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
struct cifs_sb_info *cifs_sb;
struct cifs_aio_ctx *ctx;
struct iov_iter saved_from = *from;
+ size_t len = iov_iter_count(from);
int rc;
/*
- * BB - optimize the way when signing is disabled. We can drop this
- * extra memory-to-memory copying and use iovec buffers for constructing
- * write request.
+ * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC.
+ * In this case, fall back to the non-direct write function.
+ * This could be improved by getting pages directly in ITER_KVEC.
*/
+ if (direct && from->type & ITER_KVEC) {
+ cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
+ direct = false;
+ }
rc = generic_write_checks(iocb, from);
if (rc <= 0)
@@ -2742,10 +2891,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
ctx->pos = iocb->ki_pos;
- rc = setup_aio_ctx_iter(ctx, from, WRITE);
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
+ if (direct) {
+ ctx->direct_io = true;
+ ctx->iter = *from;
+ ctx->len = len;
+ } else {
+ rc = setup_aio_ctx_iter(ctx, from, WRITE);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
}
/* grab a lock here because read response handlers can access ctx */
@@ -2795,6 +2950,16 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
return total_written;
}
+ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+ return __cifs_writev(iocb, from, true);
+}
+
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+ return __cifs_writev(iocb, from, false);
+}
+
static ssize_t
cifs_writev(struct kiocb *iocb, struct iov_iter *from)
{
@@ -2817,8 +2982,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
goto out;
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
- server->vals->exclusive_lock_type, NULL,
- CIFS_WRITE_OP))
+ server->vals->exclusive_lock_type, 0,
+ NULL, CIFS_WRITE_OP))
rc = __generic_file_write_iter(iocb, from);
else
rc = -EACCES;
@@ -2965,7 +3130,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < rdata->nr_pages; i++) {
put_page(rdata->pages[i]);
- rdata->pages[i] = NULL;
}
cifs_readdata_release(refcount);
}
@@ -2990,7 +3154,7 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
size_t copy = min_t(size_t, remaining, PAGE_SIZE);
size_t written;
- if (unlikely(iter->type & ITER_PIPE)) {
+ if (unlikely(iov_iter_is_pipe(iter))) {
void *addr = kmap_atomic(page);
written = copy_to_iter(addr, copy, iter);
@@ -3092,6 +3256,55 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
return uncached_fill_pages(server, rdata, iter, iter->count);
}
+static int cifs_resend_rdata(struct cifs_readdata *rdata,
+ struct list_head *rdata_list,
+ struct cifs_aio_ctx *ctx)
+{
+ unsigned int rsize, credits;
+ int rc;
+ struct TCP_Server_Info *server =
+ tlink_tcon(rdata->cfile->tlink)->ses->server;
+
+ /*
+ * Wait for credits to resend this rdata.
+ * Note: we are attempting to resend the whole rdata, not in segments.
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+ &rsize, &credits);
+
+ if (rc)
+ goto out;
+
+ if (rsize < rdata->bytes) {
+ add_credits_and_wake_if(server, credits, 0);
+ msleep(1000);
+ }
+ } while (rsize < rdata->bytes);
+
+ rc = -EAGAIN;
+ while (rc == -EAGAIN) {
+ rc = 0;
+ if (rdata->cfile->invalidHandle)
+ rc = cifs_reopen_file(rdata->cfile, true);
+ if (!rc)
+ rc = server->ops->async_readv(rdata);
+ }
+
+ if (!rc) {
+ /* Add to aio pending list */
+ list_add_tail(&rdata->list, rdata_list);
+ return 0;
+ }
+
+ add_credits_and_wake_if(server, rdata->credits, 0);
+out:
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+
+ return rc;
+}
+
static int
cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
@@ -3103,6 +3316,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
int rc;
pid_t pid;
struct TCP_Server_Info *server;
+ struct page **pagevec;
+ size_t start;
+ struct iov_iter direct_iov = ctx->iter;
server = tlink_tcon(open_file->tlink)->ses->server;
@@ -3111,6 +3327,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
else
pid = current->tgid;
+ if (ctx->direct_io)
+ iov_iter_advance(&direct_iov, offset - ctx->pos);
+
do {
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
&rsize, &credits);
@@ -3118,20 +3337,66 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
cur_len = min_t(const size_t, len, rsize);
- npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
- /* allocate a readdata struct */
- rdata = cifs_readdata_alloc(npages,
+ if (ctx->direct_io) {
+ ssize_t result;
+
+ result = iov_iter_get_pages_alloc(
+ &direct_iov, &pagevec,
+ cur_len, &start);
+ if (result < 0) {
+ cifs_dbg(VFS,
+ "couldn't get user pages (rc=%zd)"
+ " iter type %d"
+ " iov_offset %zd count %zd\n",
+ result, direct_iov.type,
+ direct_iov.iov_offset,
+ direct_iov.count);
+ dump_stack();
+
+ rc = result;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
+ cur_len = (size_t)result;
+ iov_iter_advance(&direct_iov, cur_len);
+
+ rdata = cifs_readdata_direct_alloc(
+ pagevec, cifs_uncached_readv_complete);
+ if (!rdata) {
+ add_credits_and_wake_if(server, credits, 0);
+ rc = -ENOMEM;
+ break;
+ }
+
+ npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
+ rdata->page_offset = start;
+ rdata->tailsz = npages > 1 ?
+ cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
+ cur_len;
+
+ } else {
+
+ npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
+ /* allocate a readdata struct */
+ rdata = cifs_readdata_alloc(npages,
cifs_uncached_readv_complete);
- if (!rdata) {
- add_credits_and_wake_if(server, credits, 0);
- rc = -ENOMEM;
- break;
- }
+ if (!rdata) {
+ add_credits_and_wake_if(server, credits, 0);
+ rc = -ENOMEM;
+ break;
+ }
- rc = cifs_read_allocate_pages(rdata, npages);
- if (rc)
- goto error;
+ rc = cifs_read_allocate_pages(rdata, npages);
+ if (rc) {
+ kvfree(rdata->pages);
+ kfree(rdata);
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
+
+ rdata->tailsz = PAGE_SIZE;
+ }
rdata->cfile = cifsFileInfo_get(open_file);
rdata->nr_pages = npages;
@@ -3139,7 +3404,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->bytes = cur_len;
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
- rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
rdata->credits = credits;
@@ -3149,13 +3413,14 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
if (!rdata->cfile->invalidHandle ||
!(rc = cifs_reopen_file(rdata->cfile, true)))
rc = server->ops->async_readv(rdata);
-error:
if (rc) {
add_credits_and_wake_if(server, rdata->credits, 0);
kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
- if (rc == -EAGAIN)
+ cifs_uncached_readdata_release);
+ if (rc == -EAGAIN) {
+ iov_iter_revert(&direct_iov, cur_len);
continue;
+ }
break;
}
@@ -3211,45 +3476,62 @@ again:
* reading.
*/
if (got_bytes && got_bytes < rdata->bytes) {
- rc = cifs_readdata_to_iov(rdata, to);
+ rc = 0;
+ if (!ctx->direct_io)
+ rc = cifs_readdata_to_iov(rdata, to);
if (rc) {
kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
+ cifs_uncached_readdata_release);
continue;
}
}
- rc = cifs_send_async_read(
+ if (ctx->direct_io) {
+ /* Re-use rdata as this is direct I/O */
+ rc = cifs_resend_rdata(
+ rdata,
+ &tmp_list, ctx);
+ } else {
+ rc = cifs_send_async_read(
rdata->offset + got_bytes,
rdata->bytes - got_bytes,
rdata->cfile, cifs_sb,
&tmp_list, ctx);
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ }
+
list_splice(&tmp_list, &ctx->list);
- kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
goto again;
} else if (rdata->result)
rc = rdata->result;
- else
+ else if (!ctx->direct_io)
rc = cifs_readdata_to_iov(rdata, to);
/* if there was a short read -- discard anything left */
if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
rc = -ENODATA;
+
+ ctx->total_len += rdata->got_bytes;
}
list_del_init(&rdata->list);
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- for (i = 0; i < ctx->npages; i++) {
- if (ctx->should_dirty)
- set_page_dirty(ctx->bv[i].bv_page);
- put_page(ctx->bv[i].bv_page);
- }
+ if (!ctx->direct_io) {
+ for (i = 0; i < ctx->npages; i++) {
+ if (ctx->should_dirty)
+ set_page_dirty(ctx->bv[i].bv_page);
+ put_page(ctx->bv[i].bv_page);
+ }
- ctx->total_len = ctx->len - iov_iter_count(to);
+ ctx->total_len = ctx->len - iov_iter_count(to);
+ }
cifs_stats_bytes_read(tcon, ctx->total_len);
@@ -3267,18 +3549,28 @@ again:
complete(&ctx->done);
}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t __cifs_readv(
+ struct kiocb *iocb, struct iov_iter *to, bool direct)
{
- struct file *file = iocb->ki_filp;
- ssize_t rc;
size_t len;
- ssize_t total_read = 0;
- loff_t offset = iocb->ki_pos;
+ struct file *file = iocb->ki_filp;
struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
struct cifsFileInfo *cfile;
+ struct cifs_tcon *tcon;
+ ssize_t rc, total_read = 0;
+ loff_t offset = iocb->ki_pos;
struct cifs_aio_ctx *ctx;
+ /*
+ * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
+ * so fall back to the data copy read path.
+ * This could be improved by getting pages directly in ITER_KVEC.
+ */
+ if (direct && to->type & ITER_KVEC) {
+ cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
+ direct = false;
+ }
+
len = iov_iter_count(to);
if (!len)
return 0;
@@ -3302,17 +3594,23 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (to->type == ITER_IOVEC)
+ if (iter_is_iovec(to))
ctx->should_dirty = true;
- rc = setup_aio_ctx_iter(ctx, to, READ);
- if (rc) {
- kref_put(&ctx->refcount, cifs_aio_ctx_release);
- return rc;
+ if (direct) {
+ ctx->pos = offset;
+ ctx->direct_io = true;
+ ctx->iter = *to;
+ ctx->len = len;
+ } else {
+ rc = setup_aio_ctx_iter(ctx, to, READ);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+ len = ctx->len;
}
- len = ctx->len;
-
/* grab a lock here because read response handlers can access ctx */
mutex_lock(&ctx->aio_mutex);
@@ -3354,6 +3652,16 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
return rc;
}
+ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ return __cifs_readv(iocb, to, true);
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ return __cifs_readv(iocb, to, false);
+}
+
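With both I/O directions now funnelled through __cifs_readv()/__cifs_writev(),
the direct variants can be plugged into the cache=none file_operations table;
an illustrative (abbreviated, assumed) wiring in cifsfs.c:

const struct file_operations cifs_file_direct_ops = {
	.read_iter	= cifs_direct_readv,
	.write_iter	= cifs_direct_writev,
	/* ... remaining members as before ... */
};
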
ssize_t
cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
{
@@ -3388,7 +3696,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
tcon->ses->server->vals->shared_lock_type,
- NULL, CIFS_READ_OP))
+ 0, NULL, CIFS_READ_OP))
rc = generic_file_read_iter(iocb, to);
up_read(&cinode->lock_sem);
return rc;
@@ -3687,7 +3995,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
INIT_LIST_HEAD(tmplist);
- page = list_entry(page_list->prev, struct page, lru);
+ page = lru_to_page(page_list);
/*
* Lock the page and put it in the cache. Since no one else
@@ -3743,7 +4051,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct TCP_Server_Info *server;
pid_t pid;
+ unsigned int xid;
+ xid = get_xid();
/*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
* immediately if the cookie is negative
@@ -3753,8 +4063,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
*/
rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
&num_pages);
- if (rc == 0)
+ if (rc == 0) {
+ free_xid(xid);
return rc;
+ }
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -3798,6 +4110,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
*/
if (unlikely(rsize < PAGE_SIZE)) {
add_credits_and_wake_if(server, credits, 0);
+ free_xid(xid);
return 0;
}
@@ -3862,6 +4175,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* allocator.
*/
cifs_fscache_readpages_cancel(mapping->host, page_list);
+ free_xid(xid);
return rc;
}
@@ -3889,8 +4203,12 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
else
cifs_dbg(FYI, "Bytes read %d\n", rc);
- file_inode(file)->i_atime =
- current_time(file_inode(file));
+ /* we do not want atime to be less than mtime; it broke some apps */
+ file_inode(file)->i_atime = current_time(file_inode(file));
+ if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime)) < 0)
+ file_inode(file)->i_atime = file_inode(file)->i_mtime;
+ else
+ file_inode(file)->i_atime = current_time(file_inode(file));
if (PAGE_SIZE > rc)
memset(read_data + rc, 0, PAGE_SIZE - rc);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6e8765f44508..478003644916 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -162,7 +162,11 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
cifs_revalidate_cache(inode, fattr);
spin_lock(&inode->i_lock);
- inode->i_atime = fattr->cf_atime;
+ /* we do not want atime to be less than mtime; it broke some apps */
+ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
+ inode->i_atime = fattr->cf_mtime;
+ else
+ inode->i_atime = fattr->cf_atime;
inode->i_mtime = fattr->cf_mtime;
inode->i_ctime = fattr->cf_ctime;
inode->i_rdev = fattr->cf_rdev;
@@ -329,7 +333,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_mtime = timespec64_trunc(fattr->cf_mtime, sb->s_time_gran);
fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
fattr->cf_nlink = 2;
- fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
+ fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL;
}
static int
@@ -726,7 +730,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
FILE_ALL_INFO *data, struct super_block *sb, int xid,
const struct cifs_fid *fid)
{
- bool validinum = false;
__u16 srchflgs;
int rc = 0, tmprc = ENOSYS;
struct cifs_tcon *tcon;
@@ -777,38 +780,52 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
rc = 0;
- } else if (rc == -EACCES && backup_cred(cifs_sb)) {
- srchinf = kzalloc(sizeof(struct cifs_search_info),
- GFP_KERNEL);
- if (srchinf == NULL) {
- rc = -ENOMEM;
- goto cgii_exit;
- }
+ } else if ((rc == -EACCES) && backup_cred(cifs_sb) &&
+ (strcmp(server->vals->version_string, SMB1_VERSION_STRING)
+ == 0)) {
+ /*
+ * For SMB2 and later the backup intent flag is already
+ * sent if needed on open and there is no path based
+ * FindFirst operation to use to retry with
+ */
+
+ srchinf = kzalloc(sizeof(struct cifs_search_info),
+ GFP_KERNEL);
+ if (srchinf == NULL) {
+ rc = -ENOMEM;
+ goto cgii_exit;
+ }
- srchinf->endOfSearch = false;
+ srchinf->endOfSearch = false;
+ if (tcon->unix_ext)
+ srchinf->info_level = SMB_FIND_FILE_UNIX;
+ else if ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0)
+ srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD;
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+ else /* no srvino useful for fallback to some netapp */
+ srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO;
- srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
- CIFS_SEARCH_CLOSE_AT_END |
- CIFS_SEARCH_BACKUP_SEARCH;
+ srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
+ CIFS_SEARCH_CLOSE_AT_END |
+ CIFS_SEARCH_BACKUP_SEARCH;
- rc = CIFSFindFirst(xid, tcon, full_path,
- cifs_sb, NULL, srchflgs, srchinf, false);
- if (!rc) {
- data =
- (FILE_ALL_INFO *)srchinf->srch_entries_start;
+ rc = CIFSFindFirst(xid, tcon, full_path,
+ cifs_sb, NULL, srchflgs, srchinf, false);
+ if (!rc) {
+ data = (FILE_ALL_INFO *)srchinf->srch_entries_start;
- cifs_dir_info_to_fattr(&fattr,
- (FILE_DIRECTORY_INFO *)data, cifs_sb);
- fattr.cf_uniqueid = le64_to_cpu(
- ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
- validinum = true;
+ cifs_dir_info_to_fattr(&fattr,
+ (FILE_DIRECTORY_INFO *)data, cifs_sb);
+ fattr.cf_uniqueid = le64_to_cpu(
+ ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
- cifs_buf_release(srchinf->ntwrk_buf_start);
- }
- kfree(srchinf);
- if (rc)
- goto cgii_exit;
+ cifs_buf_release(srchinf->ntwrk_buf_start);
+ }
+ kfree(srchinf);
+ if (rc)
+ goto cgii_exit;
} else
goto cgii_exit;
@@ -821,31 +838,29 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
*/
if (*inode == NULL) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- if (validinum == false) {
- if (server->ops->get_srv_inum)
- tmprc = server->ops->get_srv_inum(xid,
- tcon, cifs_sb, full_path,
- &fattr.cf_uniqueid, data);
- if (tmprc) {
- cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
- tmprc);
- fattr.cf_uniqueid = iunique(sb, ROOT_I);
- cifs_autodisable_serverino(cifs_sb);
- } else if ((fattr.cf_uniqueid == 0) &&
- strlen(full_path) == 0) {
- /* some servers ret bad root ino ie 0 */
- cifs_dbg(FYI, "Invalid (0) inodenum\n");
- fattr.cf_flags |=
- CIFS_FATTR_FAKE_ROOT_INO;
- fattr.cf_uniqueid =
- simple_hashstr(tcon->treeName);
- }
+ if (server->ops->get_srv_inum)
+ tmprc = server->ops->get_srv_inum(xid,
+ tcon, cifs_sb, full_path,
+ &fattr.cf_uniqueid, data);
+ if (tmprc) {
+ cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
+ tmprc);
+ fattr.cf_uniqueid = iunique(sb, ROOT_I);
+ cifs_autodisable_serverino(cifs_sb);
+ } else if ((fattr.cf_uniqueid == 0) &&
+ strlen(full_path) == 0) {
+ /* some servers ret bad root ino ie 0 */
+ cifs_dbg(FYI, "Invalid (0) inodenum\n");
+ fattr.cf_flags |=
+ CIFS_FATTR_FAKE_ROOT_INO;
+ fattr.cf_uniqueid =
+ simple_hashstr(tcon->treeName);
}
} else
fattr.cf_uniqueid = iunique(sb, ROOT_I);
} else {
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
- validinum == false && server->ops->get_srv_inum) {
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+ && server->ops->get_srv_inum) {
/*
* Pass a NULL tcon to ensure we don't make a round
* trip to the server. This only works for SMB2+.
@@ -1301,8 +1316,8 @@ cifs_drop_nlink(struct inode *inode)
/*
* If d_inode(dentry) is null (usually meaning the cached dentry
* is a negative dentry) then we would attempt a standard SMB delete, but
- * if that fails we can not attempt the fall back mechanisms on EACCESS
- * but will return the EACCESS to the caller. Note that the VFS does not call
+ * if that fails we cannot attempt the fallback mechanisms on EACCES
+ * but will return the EACCES to the caller. Note that the VFS does not call
* unlink on negative dentries currently.
*/
int cifs_unlink(struct inode *dir, struct dentry *dentry)
@@ -2242,6 +2257,11 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
* the flush returns error?
*/
rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto out;
+ }
+
mapping_set_error(inode->i_mapping, rc);
rc = 0;
@@ -2385,6 +2405,11 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
* the flush returns error?
*/
rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto cifs_setattr_exit;
+ }
+
mapping_set_error(inode->i_mapping, rc);
rc = 0;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 54f32f9143a9..76ddd98b6298 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -32,8 +32,51 @@
#include "cifs_debug.h"
#include "cifsfs.h"
#include "cifs_ioctl.h"
+#include "smb2proto.h"
#include <linux/btrfs.h>
+static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
+ unsigned long p)
+{
+ struct inode *inode = file_inode(filep);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct dentry *dentry = filep->f_path.dentry;
+ unsigned char *path;
+ __le16 *utf16_path = NULL, root_path;
+ int rc = 0;
+
+ path = build_path_from_dentry(dentry);
+ if (path == NULL)
+ return -ENOMEM;
+
+ cifs_dbg(FYI, "%s %s\n", __func__, path);
+
+ if (!path[0]) {
+ root_path = 0;
+ utf16_path = &root_path;
+ } else {
+ utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto ici_exit;
+ }
+ }
+
+ if (tcon->ses->server->ops->ioctl_query_info)
+ rc = tcon->ses->server->ops->ioctl_query_info(
+ xid, tcon, utf16_path,
+ filep->private_data ? 0 : 1, p);
+ else
+ rc = -EOPNOTSUPP;
+
+ ici_exit:
+ if (utf16_path != &root_path)
+ kfree(utf16_path);
+ kfree(path);
+ return rc;
+}
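
From userspace the new passthrough is reached with ioctl(CIFS_QUERY_INFO) on
any open file on the mount. A sketch of a caller requesting
FileAllInformation; the smb_query_info layout and header location follow this
series' cifs_ioctl.h and should be treated as assumptions (error handling
elided):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "cifs_ioctl.h"		/* assumed copy of the kernel header */

int main(void)
{
	unsigned int out_len = 4096;
	struct smb_query_info *qi;
	int fd = open("/mnt/cifs/file", O_RDONLY);

	qi = calloc(1, sizeof(*qi) + out_len);
	qi->info_type = 0x01;		/* SMB2_O_INFO_FILE */
	qi->file_info_class = 18;	/* FileAllInformation (MS-FSCC) */
	qi->output_buffer_length = out_len;

	if (ioctl(fd, CIFS_QUERY_INFO, qi) == 0)
		printf("got %u bytes back\n", qi->output_buffer_length);

	free(qi);
	close(fd);
	return 0;
}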
+
static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
unsigned long srcfd)
{
@@ -123,7 +166,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
struct inode *inode = file_inode(filep);
int rc = -ENOTTY; /* strange error - but the precedent */
unsigned int xid;
- struct cifs_sb_info *cifs_sb;
struct cifsFileInfo *pSMBFile = filep->private_data;
struct cifs_tcon *tcon;
__u64 ExtAttrBits = 0;
@@ -131,7 +173,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
- cifs_sb = CIFS_SB(inode->i_sb);
cifs_dbg(FYI, "cifs ioctl 0x%x\n", command);
switch (command) {
case FS_IOC_GETFLAGS:
@@ -196,6 +237,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
case CIFS_IOC_COPYCHUNK_FILE:
rc = cifs_ioctl_copychunk(xid, filep, arg);
break;
+ case CIFS_QUERY_INFO:
+ rc = cifs_ioctl_query_info(xid, filep, arg);
+ break;
case CIFS_IOC_SET_INTEGRITY:
if (pSMBFile == NULL)
break;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 6926685e513c..bee203055b30 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -111,19 +111,27 @@ struct cifs_tcon *
tconInfoAlloc(void)
{
struct cifs_tcon *ret_buf;
- ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
- if (ret_buf) {
- atomic_inc(&tconInfoAllocCount);
- ret_buf->tidStatus = CifsNew;
- ++ret_buf->tc_count;
- INIT_LIST_HEAD(&ret_buf->openFileList);
- INIT_LIST_HEAD(&ret_buf->tcon_list);
- spin_lock_init(&ret_buf->open_file_lock);
- mutex_init(&ret_buf->crfid.fid_mutex);
- ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid),
- GFP_KERNEL);
- spin_lock_init(&ret_buf->stat_lock);
+
+ ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
+ if (!ret_buf)
+ return NULL;
+ ret_buf->crfid.fid = kzalloc(sizeof(*ret_buf->crfid.fid), GFP_KERNEL);
+ if (!ret_buf->crfid.fid) {
+ kfree(ret_buf);
+ return NULL;
}
+
+ atomic_inc(&tconInfoAllocCount);
+ ret_buf->tidStatus = CifsNew;
+ ++ret_buf->tc_count;
+ INIT_LIST_HEAD(&ret_buf->openFileList);
+ INIT_LIST_HEAD(&ret_buf->tcon_list);
+ spin_lock_init(&ret_buf->open_file_lock);
+ mutex_init(&ret_buf->crfid.fid_mutex);
+ spin_lock_init(&ret_buf->stat_lock);
+ atomic_set(&ret_buf->num_local_opens, 0);
+ atomic_set(&ret_buf->num_remote_opens, 0);
+
return ret_buf;
}
@@ -138,6 +146,9 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
kfree(buf_to_free->nativeFileSystem);
kzfree(buf_to_free->password);
kfree(buf_to_free->crfid.fid);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ kfree(buf_to_free->dfs_path);
+#endif
kfree(buf_to_free);
}
@@ -523,9 +534,17 @@ void
cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+ struct cifs_tcon *tcon = NULL;
+
+ if (cifs_sb->master_tlink)
+ tcon = cifs_sb_master_tcon(cifs_sb);
+
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
- cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n",
- cifs_sb_master_tcon(cifs_sb)->treeName);
+ cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s.\n",
+ tcon ? tcon->treeName : "new server");
+ cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS).\n");
+ cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n");
}
}
@@ -730,6 +749,8 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
goto parse_DFS_referrals_exit;
}
+ node->ttl = le32_to_cpu(ref->TimeToLive);
+
ref++;
}
@@ -786,7 +807,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
struct page **pages = NULL;
struct bio_vec *bv = NULL;
- if (iter->type & ITER_KVEC) {
+ if (iov_iter_is_kvec(iter)) {
memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
ctx->len = count;
iov_iter_advance(iter, count);
@@ -857,7 +878,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
ctx->bv = bv;
ctx->len = saved_len - count;
ctx->npages = npages;
- iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
+ iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len);
return 0;
}
@@ -931,3 +952,20 @@ void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
else if (page == 0)
*len = rqst->rq_pagesz - rqst->rq_offset;
}
+
+void extract_unc_hostname(const char *unc, const char **h, size_t *len)
+{
+ const char *end;
+
+ /* skip initial slashes */
+ while (*unc && (*unc == '\\' || *unc == '/'))
+ unc++;
+
+ end = unc;
+
+ while (*end && !(*end == '\\' || *end == '/'))
+ end++;
+
+ *h = unc;
+ *len = end - unc;
+}
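
A usage sketch: given a UNC such as \\srv1.example.com\share (hypothetical
name), the helper returns a pointer/length pair for the host component
without allocating:

static void demo_extract_unc(void)
{
	const char *host;
	size_t len;

	extract_unc_hostname("\\\\srv1.example.com\\share", &host, &len);
	pr_info("host: %.*s\n", (int)len, host);	/* srv1.example.com */
}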
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index e169e1a5fd35..3925a7bfc74d 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -655,7 +655,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
/* scan and find it */
int i;
char *cur_ent;
- char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
+ char *end_of_smb;
+
+ if (cfile->srch_inf.ntwrk_buf_start == NULL) {
+ cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n");
+ return -EIO;
+ }
+
+ end_of_smb = cfile->srch_inf.ntwrk_buf_start +
server->ops->calc_smb_size(
cfile->srch_inf.ntwrk_buf_start,
server);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aa23c00367ec..dcd49ad60c83 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -534,9 +534,9 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if (global_secflags & CIFSSEC_MAY_NTLM)
return NTLM;
default:
- /* Fallthrough to attempt LANMAN authentication next */
break;
}
+ /* Fallthrough - to attempt LANMAN authentication next */
case CIFS_NEGFLAVOR_LANMAN:
switch (requested) {
case LANMAN:
@@ -1154,14 +1154,12 @@ out:
static int
_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
{
- struct smb_hdr *smb_buf;
SESSION_SETUP_ANDX *pSMB;
struct cifs_ses *ses = sess_data->ses;
__u32 capabilities;
char *bcc_ptr;
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
- smb_buf = (struct smb_hdr *)pSMB;
capabilities = cifs_ssetup_hdr(ses, pSMB);
if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 378151e09e91..32a6c020478f 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -929,19 +929,18 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
{
#ifdef CONFIG_CIFS_DFS_UPCALL
int rc;
- unsigned int num_referrals = 0;
- struct dfs_info3_param *referrals = NULL;
+ struct dfs_info3_param referral = {0};
- rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
- &num_referrals, &referrals, 0);
+ rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, &referral,
+ 0);
- if (!rc && num_referrals > 0) {
- *symlinkinfo = kstrndup(referrals->node_name,
- strlen(referrals->node_name),
+ if (!rc) {
+ *symlinkinfo = kstrndup(referral.node_name,
+ strlen(referral.node_name),
GFP_KERNEL);
+ free_dfs_info_param(&referral);
if (!*symlinkinfo)
rc = -ENOMEM;
- free_dfs_info_array(referrals, num_referrals);
}
return rc;
#else /* No DFS support */
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 4ed10dd086e6..b204e84b87fb 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -122,12 +122,14 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf)
+ if (max_buf < sizeof(struct smb2_lock_element))
return -EINVAL;
+ BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
max_num = max_buf / sizeof(struct smb2_lock_element);
buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf)
@@ -264,6 +266,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
return -EINVAL;
}
+ BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
max_num = max_buf / sizeof(struct smb2_lock_element);
buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf) {
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 0ffa18094335..dd10f0ce4cd5 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -33,7 +33,7 @@
/*
* Identifiers for functions that use the open, operation, close pattern
- * in smb2inode.c:smb2_open_op_close()
+ * in smb2inode.c:smb2_compound_op()
*/
#define SMB2_OP_SET_DELETE 1
#define SMB2_OP_SET_INFO 2
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1eef1791d0c4..01a76bccdb8d 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -38,54 +38,82 @@
#include "smb2proto.h"
static int
-smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition,
- __u32 create_options, void *data, int command)
+smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ __u32 desired_access, __u32 create_disposition,
+ __u32 create_options, void *ptr, int command)
{
- int rc, tmprc = 0;
+ int rc;
__le16 *utf16_path = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
- bool use_cached_root_handle = false;
+ struct cifs_ses *ses = tcon->ses;
+ int num_rqst = 0;
+ struct smb_rqst rqst[3];
+ int resp_buftype[3];
+ struct kvec rsp_iov[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec close_iov[1];
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int flags = 0;
+ __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+ unsigned int size[2];
+ void *data[2];
+ struct smb2_file_rename_info rename_info;
+ struct smb2_file_link_info link_info;
+ int len;
- if ((strcmp(full_path, "") == 0) && (create_options == 0) &&
- (desired_access == FILE_READ_ATTRIBUTES) &&
- (create_disposition == FILE_OPEN) &&
- (tcon->nohandlecache == false)) {
- rc = open_shroot(xid, tcon, &fid);
- if (rc == 0)
- use_cached_root_handle = true;
- }
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
- if (use_cached_root_handle == false) {
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- oparms.tcon = tcon;
- oparms.desired_access = desired_access;
- oparms.disposition = create_disposition;
- oparms.create_options = create_options;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
- NULL);
- if (rc) {
- kfree(utf16_path);
- return rc;
- }
- }
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+ /* Open */
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ if (backup_cred(cifs_sb))
+ oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[num_rqst].rq_iov = open_iov;
+ rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
+ rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &oparms,
+ utf16_path);
+ kfree(utf16_path);
+ if (rc)
+ goto finished;
+
+ smb2_set_next_command(tcon, &rqst[num_rqst++]);
+
+ /* Operation */
switch (command) {
- case SMB2_OP_DELETE:
- break;
case SMB2_OP_QUERY_INFO:
- tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid,
- (struct smb2_file_all_info *)data);
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[num_rqst].rq_iov = qi_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_DELETE:
break;
case SMB2_OP_MKDIR:
/*
@@ -94,39 +122,156 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
*/
break;
case SMB2_OP_RMDIR:
- tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid,
- fid.volatile_fid);
- break;
- case SMB2_OP_RENAME:
- tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, (__le16 *)data);
- break;
- case SMB2_OP_HARDLINK:
- tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, (__le16 *)data);
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = 1; /* sizeof(__u8); see MS-FSCC section 2.4.11 */
+ data[0] = &delete_pending[0];
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
- tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, current->tgid,
- (__le64 *)data, false);
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = 8; /* sizeof __le64 */
+ data[0] = ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_INFO:
- tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid,
- (FILE_BASIC_INFO *)data);
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = sizeof(FILE_BASIC_INFO);
+ data[0] = ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_BASIC_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_RENAME:
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 2;
+
+ len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+
+ rename_info.ReplaceIfExists = 1;
+ rename_info.RootDirectory = 0;
+ rename_info.FileNameLength = cpu_to_le32(len);
+
+ size[0] = sizeof(struct smb2_file_rename_info);
+ data[0] = &rename_info;
+
+ size[1] = len + 2 /* null */;
+ data[1] = (__le16 *)ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_RENAME_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_HARDLINK:
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 2;
+
+ len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+
+ link_info.ReplaceIfExists = 0;
+ link_info.RootDirectory = 0;
+ link_info.FileNameLength = cpu_to_le32(len);
+
+ size[0] = sizeof(struct smb2_file_link_info);
+ data[0] = &link_info;
+
+ size[1] = len + 2 /* null */;
+ data[1] = (__le16 *)ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_LINK_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
break;
default:
cifs_dbg(VFS, "Invalid command\n");
- break;
+ rc = -EINVAL;
}
+ if (rc)
+ goto finished;
- if (use_cached_root_handle)
- close_shroot(&tcon->crfid);
- else
- rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- if (tmprc)
- rc = tmprc;
- kfree(utf16_path);
+ /* Close */
+ memset(&close_iov, 0, sizeof(close_iov));
+ rqst[num_rqst].rq_iov = close_iov;
+ rqst[num_rqst].rq_nvec = 1;
+ rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID);
+ smb2_set_related(&rqst[num_rqst++]);
+ if (rc)
+ goto finished;
+
+ rc = compound_send_recv(xid, ses, flags, num_rqst, rqst,
+ resp_buftype, rsp_iov);
+
+ finished:
+ SMB2_open_free(&rqst[0]);
+ switch (command) {
+ case SMB2_OP_QUERY_INFO:
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ ptr);
+ }
+ if (rqst[1].rq_iov)
+ SMB2_query_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+ break;
+ case SMB2_OP_DELETE:
+ case SMB2_OP_MKDIR:
+ if (rqst[1].rq_iov)
+ SMB2_close_free(&rqst[1]);
+ break;
+ case SMB2_OP_HARDLINK:
+ case SMB2_OP_RENAME:
+ case SMB2_OP_RMDIR:
+ case SMB2_OP_SET_EOF:
+ case SMB2_OP_SET_INFO:
+ if (rqst[1].rq_iov)
+ SMB2_set_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+ break;
+ }
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
return rc;
}
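
smb2_compound_op() packs the create, the operation and the close into one
compound round trip. The chaining itself is done by the two helpers called
above: smb2_set_next_command() sets the header's NextCommand offset so the
requests travel in one message, and smb2_set_related() marks a request as
operating on the handle produced earlier in the chain, which is why the
operation and close can be built with the COMPOUND_FID (all-0xFF) sentinel
before any real fid exists. A simplified sketch of the latter:

/* Sketch (simplified from smb2ops.c in this series). */
static void sketch_set_related(struct smb_rqst *rqst)
{
	struct smb2_sync_hdr *shdr = rqst->rq_iov[0].iov_base;

	shdr->Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
}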
@@ -147,6 +292,9 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
{
int rc;
struct smb2_file_all_info *smb2_data;
+ __u32 create_options = 0;
+ struct cifs_fid fid;
+ bool no_cached_open = tcon->nohandlecache;
*adjust_tz = false;
*symlink = false;
@@ -156,16 +304,35 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if (smb2_data == NULL)
return -ENOMEM;
- rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
- smb2_data, SMB2_OP_QUERY_INFO);
+ /* If it is a root and its handle is cached then use it */
+ if (!strlen(full_path) && !no_cached_open) {
+ rc = open_shroot(xid, tcon, &fid);
+ if (rc)
+ goto out;
+ rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, smb2_data);
+ close_shroot(&tcon->crfid);
+ if (rc)
+ goto out;
+ move_smb2_info_to_cifs(data, smb2_data);
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
+ smb2_data, SMB2_OP_QUERY_INFO);
if (rc == -EOPNOTSUPP) {
*symlink = true;
+ create_options |= OPEN_REPARSE_POINT;
+
/* Failed on a symbolic link - query a reparse point info */
- rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- OPEN_REPARSE_POINT, smb2_data,
- SMB2_OP_QUERY_INFO);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, smb2_data,
+ SMB2_OP_QUERY_INFO);
}
if (rc)
goto out;
@@ -180,9 +347,9 @@ int
smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
+ return smb2_compound_op(xid, tcon, cifs_sb, name,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+ CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
}
void
@@ -199,9 +366,9 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
- tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
+ tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+ CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -210,18 +377,18 @@ int
smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE,
- NULL, SMB2_OP_RMDIR);
+ return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+ CREATE_NOT_FILE,
+ NULL, SMB2_OP_RMDIR);
}
int
smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- NULL, SMB2_OP_DELETE);
+ return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+ CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+ NULL, SMB2_OP_DELETE);
}
static int
@@ -237,9 +404,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
rc = -ENOMEM;
goto smb2_rename_path;
}
-
- rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, smb2_to_name, command);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
+ FILE_OPEN, 0, smb2_to_name, command);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -269,9 +435,10 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, bool set_alloc)
{
__le64 eof = cpu_to_le64(size);
- return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
- SMB2_OP_SET_EOF);
+
+ return smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
+ SMB2_OP_SET_EOF);
}
int
@@ -291,9 +458,9 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
- FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
- SMB2_OP_SET_INFO);
+ rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path,
+ FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
+ SMB2_OP_SET_INFO);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 20a2d304c603..924269cec135 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -288,7 +288,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"},
{STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"},
{STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"},
- {STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"},
+ {STATUS_NOT_IMPLEMENTED, -EOPNOTSUPP, "STATUS_NOT_IMPLEMENTED"},
{STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"},
{STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"},
{STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"},
@@ -379,8 +379,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"},
{STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"},
{STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"},
- {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"},
- {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"},
+ {STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"},
+ {STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"},
{STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"},
{STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS,
"STATUS_CTL_FILE_NOT_SUPPORTED"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 6a9c47541c53..7b8b58fb4d3f 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -648,6 +648,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
+ if (rsp->sync_hdr.CreditRequest) {
+ spin_lock(&server->req_lock);
+ server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ }
+
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
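
The smb2misc.c hunk above banks any credits granted on an oplock break before
the break is processed. The idiom is worth isolating (a minimal sketch; the
function name is illustrative):

static void sketch_bank_credits(struct TCP_Server_Info *server,
				__le16 credit_request)
{
	if (!credit_request)
		return;		/* server granted nothing with this PDU */

	spin_lock(&server->req_lock);
	server->credits += le16_to_cpu(credit_request);
	spin_unlock(&server->req_lock);

	/* let senders blocked waiting for credits re-check the count */
	wake_up(&server->request_q);
}
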
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 89985a0a6819..6f96e2292856 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -34,6 +34,7 @@
#include "cifs_ioctl.h"
#include "smbdirect.h"
+/* Change credits for different ops and return the total number of credits */
static int
change_conf(struct TCP_Server_Info *server)
{
@@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server)
server->oplock_credits = server->echo_credits = 0;
switch (server->credits) {
case 0:
- return -1;
+ return 0;
case 1:
server->echoes = false;
server->oplocks = false;
- cifs_dbg(VFS, "disabling echoes and oplocks\n");
break;
case 2:
server->echoes = true;
server->oplocks = false;
server->echo_credits = 1;
- cifs_dbg(FYI, "disabling oplocks\n");
break;
default:
server->echoes = true;
@@ -64,16 +63,23 @@ change_conf(struct TCP_Server_Info *server)
server->echo_credits = 1;
}
server->credits -= server->echo_credits + server->oplock_credits;
- return 0;
+ return server->credits + server->echo_credits + server->oplock_credits;
}
static void
smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
const int optype)
{
- int *val, rc = 0;
+ int *val, rc = -1;
+
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
+
+	/* e.g. a write overlapping reconnect can mess up the credit count */
+ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
+ trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
+ server->hostname, *val);
+
*val += add;
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
@@ -95,8 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
}
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- if (rc)
- cifs_reconnect(server);
+
+ if (server->tcpStatus == CifsNeedReconnect)
+ return;
+
+ switch (rc) {
+ case -1:
+ /* change_conf hasn't been executed */
+ break;
+ case 0:
+ cifs_dbg(VFS, "Possible client or server bug - zero credits\n");
+ break;
+ case 1:
+ cifs_dbg(VFS, "disabling echoes and oplocks\n");
+ break;
+ case 2:
+ cifs_dbg(FYI, "disabling oplocks\n");
+ break;
+ default:
+ cifs_dbg(FYI, "add %u credits total=%d\n", add, rc);
+ }
}
static void
@@ -104,7 +128,12 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val)
{
spin_lock(&server->req_lock);
server->credits = val;
+ if (val == 1)
+ server->reconnect_instance++;
spin_unlock(&server->req_lock);
+ /* don't log while holding the lock */
+ if (val == 1)
+ cifs_dbg(FYI, "set credits to 1 due to smb2 reconnect\n");
}
static int *
@@ -125,7 +154,11 @@ smb2_get_credits(struct mid_q_entry *mid)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
- return le16_to_cpu(shdr->CreditRequest);
+ if (mid->mid_state == MID_RESPONSE_RECEIVED
+ || mid->mid_state == MID_RESPONSE_MALFORMED)
+ return le16_to_cpu(shdr->CreditRequest);
+
+ return 0;
}
static int
@@ -154,14 +187,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
scredits = server->credits;
/* can deadlock with reopen */
- if (scredits == 1) {
+ if (scredits <= 8) {
*num = SMB2_MAX_BUFFER_SIZE;
*credits = 0;
break;
}
- /* leave one credit for a possible reopen */
- scredits--;
+ /* leave some credits for reopen and other ops */
+ scredits -= 8;
*num = min_t(unsigned int, size,
scredits * SMB2_MAX_BUFFER_SIZE);
@@ -270,6 +303,31 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
}
static unsigned int
+smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ wsize = volume_info->wsize ? volume_info->wsize : SMB3_DEFAULT_IOSIZE;
+ wsize = min_t(unsigned int, wsize, server->max_write);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->rdma) {
+ if (server->sign)
+ wsize = min_t(unsigned int,
+ wsize, server->smbd_conn->max_fragmented_send_size);
+ else
+ wsize = min_t(unsigned int,
+ wsize, server->smbd_conn->max_readwrite_size);
+ }
+#endif
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
+
+ return wsize;
+}
+
+static unsigned int
smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
{
struct TCP_Server_Info *server = tcon->ses->server;
@@ -295,6 +353,31 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
return rsize;
}
+static unsigned int
+smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int rsize;
+
+ /* start with specified rsize, or default */
+ rsize = volume_info->rsize ? volume_info->rsize : SMB3_DEFAULT_IOSIZE;
+ rsize = min_t(unsigned int, rsize, server->max_read);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->rdma) {
+ if (server->sign)
+ rsize = min_t(unsigned int,
+ rsize, server->smbd_conn->max_fragmented_recv_size);
+ else
+ rsize = min_t(unsigned int,
+ rsize, server->smbd_conn->max_readwrite_size);
+ }
+#endif
+
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
+
+ return rsize;
+}
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
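
smb3_negotiate_wsize() above is a chain of min_t() clamps: the mount option
(or SMB3_DEFAULT_IOSIZE), then the server's max_write, then the SMB Direct
limits, then the 64K SMB2_MAX_BUFFER_SIZE cap when the server lacks
SMB2_GLOBAL_CAP_LARGE_MTU. Worked through with made-up numbers (a sketch, not
the in-tree function); smb3_negotiate_rsize() above composes the same way
against server->max_read:

static unsigned int sketch_wsize(unsigned int requested,  /* e.g. 8M */
				 unsigned int max_write,  /* e.g. 4M */
				 bool large_mtu)
{
	unsigned int wsize = requested ? requested : SMB3_DEFAULT_IOSIZE;

	wsize = min_t(unsigned int, wsize, max_write);	  /* 8M -> 4M  */
	if (!large_mtu)					  /* old server */
		wsize = min_t(unsigned int, wsize,
			      SMB2_MAX_BUFFER_SIZE);	  /* 4M -> 64K */
	return wsize;
}
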
@@ -686,6 +769,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
int rc = 0;
unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
char *name, *value;
+ size_t buf_size = dst_size;
size_t name_len, value_len, user_name_len;
while (src_size > 0) {
@@ -721,9 +805,10 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
/* 'user.' plus a terminating null */
user_name_len = 5 + 1 + name_len;
- rc += user_name_len;
-
- if (dst_size >= user_name_len) {
+ if (buf_size == 0) {
+ /* skip copy - calc size only */
+ rc += user_name_len;
+ } else if (dst_size >= user_name_len) {
dst_size -= user_name_len;
memcpy(dst, "user.", 5);
dst += 5;
@@ -731,8 +816,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
dst += name_len;
*dst = 0;
++dst;
- } else if (dst_size == 0) {
- /* skip copy - calc size only */
+ rc += user_name_len;
} else {
/* stop before overrun buffer */
rc = -ERANGE;
@@ -769,72 +853,50 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
{
int rc;
__le16 *utf16_path;
- __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- struct cifs_open_parms oparms;
- struct cifs_fid fid;
- struct smb2_file_full_ea_info *smb2_data;
- int ea_buf_size = SMB2_MIN_EA_BUF;
+ struct kvec rsp_iov = {NULL, 0};
+ int buftype = CIFS_NO_BUFFER;
+ struct smb2_query_info_rsp *rsp;
+ struct smb2_file_full_ea_info *info = NULL;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_EA;
- oparms.disposition = FILE_OPEN;
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
- kfree(utf16_path);
+ rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ FILE_READ_EA,
+ FILE_FULL_EA_INFORMATION,
+ SMB2_O_INFO_FILE,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE,
+ &rsp_iov, &buftype, cifs_sb);
if (rc) {
- cifs_dbg(FYI, "open failed rc=%d\n", rc);
- return rc;
- }
-
- while (1) {
- smb2_data = kzalloc(ea_buf_size, GFP_KERNEL);
- if (smb2_data == NULL) {
- SMB2_close(xid, tcon, fid.persistent_fid,
- fid.volatile_fid);
- return -ENOMEM;
- }
-
- rc = SMB2_query_eas(xid, tcon, fid.persistent_fid,
- fid.volatile_fid,
- ea_buf_size, smb2_data);
-
- if (rc != -E2BIG)
- break;
-
- kfree(smb2_data);
- ea_buf_size <<= 1;
-
- if (ea_buf_size > SMB2_MAX_EA_BUF) {
- cifs_dbg(VFS, "EA size is too large\n");
- SMB2_close(xid, tcon, fid.persistent_fid,
- fid.volatile_fid);
- return -ENOMEM;
- }
+ /*
+ * If ea_name is NULL (listxattr) and there are no EAs,
+ * return 0 as it's not an error. Otherwise, the specified
+ * ea_name was not found.
+ */
+ if (!ea_name && rc == -ENODATA)
+ rc = 0;
+ goto qeas_exit;
}
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
+ rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength),
+ &rsp_iov,
+ sizeof(struct smb2_file_full_ea_info));
+ if (rc)
+ goto qeas_exit;
- /*
- * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's
- * not an error. Otherwise, the specified ea_name was not found.
- */
- if (!rc)
- rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data,
- SMB2_MAX_EA_BUF, ea_name);
- else if (!ea_name && rc == -ENODATA)
- rc = 0;
+ info = (struct smb2_file_full_ea_info *)(
+ le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
+ rc = move_smb2_ea_to_cifs(ea_data, buf_size, info,
+ le32_to_cpu(rsp->OutputBufferLength), ea_name);
- kfree(smb2_data);
+ qeas_exit:
+ kfree(utf16_path);
+ free_rsp_buf(buftype, rsp_iov.iov_base);
return rc;
}
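
The rewritten smb2_query_eas() above fetches the whole EA blob in one compound
query instead of reopening with ever larger buffers. For context, the blob
move_smb2_ea_to_cifs() then consumes is a chain of FILE_FULL_EA_INFORMATION
records. A sketch of walking it (ea_name_length and ea_data appear in this
diff; next_entry_offset and ea_value_length follow the on-the-wire layout and
are assumptions here; real code must also bound-check each record against the
remaining buffer, as the src_size logic above does):

static void sketch_walk_eas(struct smb2_file_full_ea_info *info)
{
	char *p = (char *)info;

	for (;;) {
		struct smb2_file_full_ea_info *ea = (void *)p;

		pr_debug("ea %.*s: %u value bytes\n",
			 ea->ea_name_length, ea->ea_data,
			 le16_to_cpu(ea->ea_value_length));

		if (!ea->next_entry_offset)	/* zero ends the chain */
			break;
		p += le32_to_cpu(ea->next_entry_offset);
	}
}
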
@@ -845,14 +907,27 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
const __u16 ea_value_len, const struct nls_table *nls_codepage,
struct cifs_sb_info *cifs_sb)
{
- int rc;
- __le16 *utf16_path;
- __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- struct cifs_open_parms oparms;
- struct cifs_fid fid;
- struct smb2_file_full_ea_info *ea;
+ struct cifs_ses *ses = tcon->ses;
+ __le16 *utf16_path = NULL;
int ea_name_len = strlen(ea_name);
+ int flags = 0;
int len;
+ struct smb_rqst rqst[3];
+ int resp_buftype[3];
+ struct kvec rsp_iov[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct cifs_open_parms oparms;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_fid fid;
+ struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ unsigned int size[1];
+ void *data[1];
+ struct smb2_file_full_ea_info *ea = NULL;
+ struct kvec close_iov[1];
+ int rc;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
if (ea_name_len > 255)
return -EINVAL;
@@ -861,6 +936,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
if (!utf16_path)
return -ENOMEM;
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ memset(&oparms, 0, sizeof(oparms));
oparms.tcon = tcon;
oparms.desired_access = FILE_WRITE_EA;
oparms.disposition = FILE_OPEN;
@@ -871,18 +956,22 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
- kfree(utf16_path);
- if (rc) {
- cifs_dbg(FYI, "open failed rc=%d\n", rc);
- return rc;
- }
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path);
+ if (rc)
+ goto sea_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ /* Set Info */
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[1].rq_iov = si_iov;
+ rqst[1].rq_nvec = 1;
-	len = sizeof(ea) + ea_name_len + ea_value_len + 1;
+	len = sizeof(*ea) + ea_name_len + ea_value_len + 1;
ea = kzalloc(len, GFP_KERNEL);
if (ea == NULL) {
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- return -ENOMEM;
+ rc = -ENOMEM;
+ goto sea_exit;
}
ea->ea_name_length = ea_name_len;
@@ -890,12 +979,36 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
memcpy(ea->ea_data, ea_name, ea_name_len + 1);
memcpy(ea->ea_data + ea_name_len + 1, ea_value, ea_value_len);
- rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea,
- len);
- kfree(ea);
+ size[0] = len;
+ data[0] = ea;
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ rc = SMB2_set_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_FULL_EA_INFORMATION,
+			       SMB2_O_INFO_FILE, 0, data, size);
+	if (rc)
+		goto sea_exit;
+	smb2_set_next_command(tcon, &rqst[1]);
+ smb2_set_related(&rqst[1]);
+
+ /* Close */
+ memset(&close_iov, 0, sizeof(close_iov));
+ rqst[2].rq_iov = close_iov;
+ rqst[2].rq_nvec = 1;
+	rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
+	if (rc)
+		goto sea_exit;
+	smb2_set_related(&rqst[2]);
+
+ rc = compound_send_recv(xid, ses, flags, 3, rqst,
+ resp_buftype, rsp_iov);
+ sea_exit:
+ kfree(ea);
+ kfree(utf16_path);
+ SMB2_open_free(&rqst[0]);
+ SMB2_set_info_free(&rqst[1]);
+ SMB2_close_free(&rqst[2]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
return rc;
}
#endif
@@ -962,6 +1075,9 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, "\nBytes read: %llu Bytes written: %llu",
(long long)(tcon->bytes_read),
(long long)(tcon->bytes_written));
+ seq_printf(m, "\nOpen files: %d total (local), %d open on server",
+ atomic_read(&tcon->num_local_opens),
+ atomic_read(&tcon->num_remote_opens));
seq_printf(m, "\nTreeConnects: %d total %d failed",
atomic_read(&sent[SMB2_TREE_CONNECT_HE]),
atomic_read(&failed[SMB2_TREE_CONNECT_HE]));
@@ -1014,6 +1130,9 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
cfile->fid.persistent_fid = fid->persistent_fid;
cfile->fid.volatile_fid = fid->volatile_fid;
+#ifdef CONFIG_CIFS_DEBUG2
+ cfile->fid.mid = fid->mid;
+#endif /* CIFS_DEBUG2 */
server->ops->set_oplock_level(cinode, oplock, fid->epoch,
&fid->purge_cache);
cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
@@ -1057,6 +1176,131 @@ req_res_key_exit:
return rc;
}
+static int
+smb2_ioctl_query_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ __le16 *path, int is_dir,
+ unsigned long p)
+{
+ struct cifs_ses *ses = tcon->ses;
+ char __user *arg = (char __user *)p;
+ struct smb_query_info qi;
+ struct smb_query_info __user *pqi;
+ int rc = 0;
+ int flags = 0;
+ struct smb2_query_info_rsp *rsp = NULL;
+ void *buffer = NULL;
+ struct smb_rqst rqst[3];
+ int resp_buftype[3];
+ struct kvec rsp_iov[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct cifs_open_parms oparms;
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_fid fid;
+ struct kvec qi_iov[1];
+ struct kvec close_iov[1];
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ if (copy_from_user(&qi, arg, sizeof(struct smb_query_info)))
+ return -EFAULT;
+
+ if (qi.output_buffer_length > 1024)
+ return -EINVAL;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL);
+ if (buffer == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(buffer, arg + sizeof(struct smb_query_info),
+ qi.output_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ memset(&oparms, 0, sizeof(oparms));
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL;
+ oparms.disposition = FILE_OPEN;
+ if (is_dir)
+ oparms.create_options = CREATE_NOT_FILE;
+ else
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path);
+ if (rc)
+ goto iqinf_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ /* Query */
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ qi.file_info_class, qi.info_type,
+ qi.additional_information,
+ qi.input_buffer_length,
+ qi.output_buffer_length, buffer);
+ if (rc)
+ goto iqinf_exit;
+ smb2_set_next_command(tcon, &rqst[1]);
+ smb2_set_related(&rqst[1]);
+
+ /* Close */
+ memset(&close_iov, 0, sizeof(close_iov));
+ rqst[2].rq_iov = close_iov;
+ rqst[2].rq_nvec = 1;
+
+ rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
+ if (rc)
+ goto iqinf_exit;
+ smb2_set_related(&rqst[2]);
+
+ rc = compound_send_recv(xid, ses, flags, 3, rqst,
+ resp_buftype, rsp_iov);
+ if (rc)
+ goto iqinf_exit;
+ pqi = (struct smb_query_info __user *)arg;
+ rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+
+ iqinf_exit:
+ kfree(buffer);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ SMB2_close_free(&rqst[2]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ return rc;
+}
+
static ssize_t
smb2_copychunk_range(const unsigned int xid,
struct cifsFileInfo *srcfile,
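
smb2_ioctl_query_info() above expects a struct smb_query_info header
immediately followed by the caller's buffer in one user allocation; on return,
input_buffer_length is clamped to what the server sent and the payload is
written back right after the header (the pqi + 1 store). A hypothetical
userspace caller -- the request macro value, the struct layout and the info
class picked are all assumptions, so verify them against fs/cifs/cifs_ioctl.h:

#include <linux/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

struct smb_query_info {			/* assumed layout */
	__u32 info_type;		/* e.g. SMB2_O_INFO_FILE       */
	__u32 file_info_class;
	__u32 additional_information;
	__u32 flags;
	__u32 input_buffer_length;
	__u32 output_buffer_length;
	/* followed in the same allocation by the query buffer */
};

#define CIFS_QUERY_INFO _IOWR(0xCF, 7, struct smb_query_info) /* assumed */

static int sketch_query_info(int fd)
{
	__u32 out_len = 512;		/* must be <= 1024, see above */
	struct smb_query_info *qi;

	qi = calloc(1, sizeof(*qi) + out_len);
	if (!qi)
		return -1;
	qi->info_type = 1;		/* SMB2_O_INFO_FILE            */
	qi->file_info_class = 18;	/* hypothetical info class     */
	qi->output_buffer_length = out_len;

	if (ioctl(fd, CIFS_QUERY_INFO, qi) < 0) {
		perror("CIFS_QUERY_INFO");
		free(qi);
		return -1;
	}
	printf("got %u bytes back\n", qi->input_buffer_length);
	free(qi);
	return 0;
}
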
@@ -1301,7 +1545,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
}
return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof, false);
+ cfile->fid.volatile_fid, cfile->pid, &eof);
}
static int
@@ -1556,7 +1800,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
CIFS_CACHE_READ(cinode) ? 1 : 0);
}
-static void
+void
smb2_set_related(struct smb_rqst *rqst)
{
struct smb2_sync_hdr *shdr;
@@ -1567,50 +1811,87 @@ smb2_set_related(struct smb_rqst *rqst)
char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
-static void
-smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+void
+smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
{
struct smb2_sync_hdr *shdr;
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
unsigned long len = smb_rqst_len(server, rqst);
+ int i, num_padding;
/* SMB headers in a compound are 8 byte aligned. */
- if (len & 7) {
+
+ /* No padding needed */
+ if (!(len & 7))
+ goto finished;
+
+ num_padding = 8 - (len & 7);
+ if (!smb3_encryption_required(tcon)) {
+ /*
+ * If we do not have encryption then we can just add an extra
+ * iov for the padding.
+ */
rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
- rqst->rq_iov[rqst->rq_nvec].iov_len = 8 - (len & 7);
+ rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
rqst->rq_nvec++;
- len = smb_rqst_len(server, rqst);
+ len += num_padding;
+ } else {
+ /*
+		 * We cannot add a small padding iov for the encryption case
+		 * because the encryption framework cannot handle the padding
+		 * iovs. We have to flatten this into a single buffer and add
+		 * the padding to it.
+ */
+ for (i = 1; i < rqst->rq_nvec; i++) {
+ memcpy(rqst->rq_iov[0].iov_base +
+ rqst->rq_iov[0].iov_len,
+ rqst->rq_iov[i].iov_base,
+ rqst->rq_iov[i].iov_len);
+ rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+ }
+ memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+ 0, num_padding);
+ rqst->rq_iov[0].iov_len += num_padding;
+ len += num_padding;
+ rqst->rq_nvec = 1;
}
+ finished:
shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
shdr->NextCommand = cpu_to_le32(len);
}
-static int
-smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
- struct kstatfs *buf)
+/*
+ * Passes the query info response back to the caller on success.
+ * Caller needs to free this with free_rsp_buf().
+ */
+int
+smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
+ __le16 *utf16_path, u32 desired_access,
+ u32 class, u32 type, u32 output_len,
+ struct kvec *rsp, int *buftype,
+ struct cifs_sb_info *cifs_sb)
{
- struct smb2_query_info_rsp *rsp;
- struct smb2_fs_full_size_info *info = NULL;
+ struct cifs_ses *ses = tcon->ses;
+ int flags = 0;
struct smb_rqst rqst[3];
int resp_buftype[3];
struct kvec rsp_iov[3];
struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
struct kvec qi_iov[1];
struct kvec close_iov[1];
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
- __le16 srch_path = 0; /* Null - open root of share */
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
- int flags = 0;
int rc;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
memset(rqst, 0, sizeof(rqst));
- memset(resp_buftype, 0, sizeof(resp_buftype));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
memset(&open_iov, 0, sizeof(open_iov));
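
The padding rework in smb2_set_next_command() above makes each chained PDU
start on an 8-byte boundary (flattening the iovs first when the request will
be encrypted). The offset arithmetic, worked through with an illustrative
length -- note the real function skips padding entirely when len is already
aligned, rather than adding 8:

static unsigned long sketch_next_command_offset(unsigned long len)
{
	if (len & 7)			/* e.g. len = 121 -> 7 pad bytes */
		len += 8 - (len & 7);	/* NextCommand = 128, aligned    */
	return len;
}
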
@@ -1618,28 +1899,31 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.desired_access = desired_access;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ if (cifs_sb && backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &srch_path);
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path);
if (rc)
- goto qfs_exit;
- smb2_set_next_command(server, &rqst[0]);
+ goto qic_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
memset(&qi_iov, 0, sizeof(qi_iov));
rqst[1].rq_iov = qi_iov;
rqst[1].rq_nvec = 1;
rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- FS_FULL_SIZE_INFORMATION,
- SMB2_O_INFO_FILESYSTEM, 0,
- sizeof(struct smb2_fs_full_size_info));
+ class, type, 0,
+ output_len, 0,
+ NULL);
if (rc)
- goto qfs_exit;
- smb2_set_next_command(server, &rqst[1]);
+ goto qic_exit;
+ smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
memset(&close_iov, 0, sizeof(close_iov));
@@ -1648,32 +1932,61 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
if (rc)
- goto qfs_exit;
+ goto qic_exit;
smb2_set_related(&rqst[2]);
rc = compound_send_recv(xid, ses, flags, 3, rqst,
resp_buftype, rsp_iov);
+ if (rc) {
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ goto qic_exit;
+ }
+ *rsp = rsp_iov[1];
+ *buftype = resp_buftype[1];
+
+ qic_exit:
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ SMB2_close_free(&rqst[2]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ return rc;
+}
+
+static int
+smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
+ struct kstatfs *buf)
+{
+ struct smb2_query_info_rsp *rsp;
+ struct smb2_fs_full_size_info *info = NULL;
+ __le16 utf16_path = 0; /* Null - open root of share */
+ struct kvec rsp_iov = {NULL, 0};
+ int buftype = CIFS_NO_BUFFER;
+ int rc;
+
+ rc = smb2_query_info_compound(xid, tcon, &utf16_path,
+ FILE_READ_ATTRIBUTES,
+ FS_FULL_SIZE_INFORMATION,
+ SMB2_O_INFO_FILESYSTEM,
+ sizeof(struct smb2_fs_full_size_info),
+ &rsp_iov, &buftype, NULL);
if (rc)
goto qfs_exit;
- rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
buf->f_type = SMB2_MAGIC_NUMBER;
info = (struct smb2_fs_full_size_info *)(
le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
- &rsp_iov[1],
+ &rsp_iov,
sizeof(struct smb2_fs_full_size_info));
if (!rc)
smb2_copy_fs_info_to_kstatfs(info, buf);
qfs_exit:
- SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
- SMB2_close_free(&rqst[2]);
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ free_rsp_buf(buftype, rsp_iov.iov_base);
return rc;
}
@@ -2542,7 +2855,7 @@ init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign)
smb2_sg_set_buf(&sg[idx++],
rqst[i].rq_iov[j].iov_base + skip,
rqst[i].rq_iov[j].iov_len - skip);
- }
+ }
for (j = 0; j < rqst[i].rq_npages; j++) {
unsigned int len, offset;
@@ -2900,11 +3213,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
server->ops->is_status_pending(buf, server, 0))
return -1;
- rdata->result = server->ops->map_error(buf, false);
+ /* set up first two iov to get credits */
+ rdata->iov[0].iov_base = buf;
+ rdata->iov[0].iov_len = 4;
+ rdata->iov[1].iov_base = buf + 4;
+ rdata->iov[1].iov_len =
+ min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4;
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+ cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+ rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
+ rdata->result = server->ops->map_error(buf, true);
if (rdata->result != 0) {
cifs_dbg(FYI, "%s: server returned error %d\n",
__func__, rdata->result);
- dequeue_mid(mid, rdata->result);
+ /* normal error on read response */
+ dequeue_mid(mid, false);
return 0;
}
@@ -2962,13 +3287,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len);
+ iov_iter_bvec(&iter, WRITE, bvec, npages, data_len);
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
iov.iov_base = buf + data_offset;
iov.iov_len = data_len;
- iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len);
+ iov_iter_kvec(&iter, WRITE, &iov, 1, data_len);
} else {
/* read response payload cannot be in both buf and pages */
WARN_ONCE(1, "buf can not contain only a part of read data");
@@ -2977,14 +3302,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- /* set up first iov for signature check */
- rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->vals->read_rsp_size - 4;
- cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
- rdata->iov[0].iov_base, server->vals->read_rsp_size);
-
length = rdata->copy_into_pages(server, rdata, &iter);
kfree(bvec);
@@ -3183,8 +3500,10 @@ smb3_receive_transform(struct TCP_Server_Info *server,
}
/* TODO: add support for compounds containing READ. */
- if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) {
+ *num_mids = 1;
return receive_encrypted_read(server, &mids[0]);
+ }
return receive_encrypted_standard(server, mids, bufs, num_mids);
}
@@ -3303,6 +3622,7 @@ struct smb_version_operations smb20_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_operations smb21_operations = {
@@ -3398,6 +3718,7 @@ struct smb_version_operations smb21_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_operations smb30_operations = {
@@ -3425,8 +3746,8 @@ struct smb_version_operations smb30_operations = {
.downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
- .negotiate_wsize = smb2_negotiate_wsize,
- .negotiate_rsize = smb2_negotiate_rsize,
+ .negotiate_wsize = smb3_negotiate_wsize,
+ .negotiate_rsize = smb3_negotiate_rsize,
.sess_setup = SMB2_sess_setup,
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
@@ -3502,6 +3823,7 @@ struct smb_version_operations smb30_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_operations smb311_operations = {
@@ -3529,8 +3851,8 @@ struct smb_version_operations smb311_operations = {
.downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
- .negotiate_wsize = smb2_negotiate_wsize,
- .negotiate_rsize = smb2_negotiate_rsize,
+ .negotiate_wsize = smb3_negotiate_wsize,
+ .negotiate_rsize = smb3_negotiate_rsize,
.sess_setup = SMB2_sess_setup,
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
@@ -3607,6 +3929,7 @@ struct smb_version_operations smb311_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index f54d07bda067..77b3aaa39b35 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -50,6 +50,9 @@
#include "cifs_spnego.h"
#include "smbdirect.h"
#include "trace.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -152,6 +155,86 @@ out:
return;
}
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int __smb2_reconnect(const struct nls_table *nlsc,
+ struct cifs_tcon *tcon)
+{
+ int rc;
+ struct dfs_cache_tgt_list tl;
+ struct dfs_cache_tgt_iterator *it = NULL;
+ char *tree;
+ const char *tcp_host;
+ size_t tcp_host_len;
+ const char *dfs_host;
+ size_t dfs_host_len;
+
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree)
+ return -ENOMEM;
+
+ if (tcon->ipc) {
+ snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
+ rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
+ goto out;
+ }
+
+ if (!tcon->dfs_path) {
+ rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+ goto out;
+ }
+
+ rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
+ if (rc)
+ goto out;
+
+ extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
+ &tcp_host_len);
+
+ for (it = dfs_cache_get_tgt_iterator(&tl); it;
+ it = dfs_cache_get_next_tgt(&tl, it)) {
+ const char *tgt = dfs_cache_get_tgt_name(it);
+
+ extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+
+		if (dfs_host_len != tcp_host_len ||
+		    strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
+			cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s\n",
+ __func__,
+ (int)dfs_host_len, dfs_host,
+ (int)tcp_host_len, tcp_host);
+ continue;
+ }
+
+ snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+
+ rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
+ if (!rc)
+ break;
+ if (rc == -EREMOTE)
+ break;
+ }
+
+ if (!rc) {
+ if (it)
+ rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1,
+ it);
+ else
+ rc = -ENOENT;
+ }
+ dfs_cache_free_tgts(&tl);
+out:
+ kfree(tree);
+ return rc;
+}
+#else
+static inline int __smb2_reconnect(const struct nls_table *nlsc,
+ struct cifs_tcon *tcon)
+{
+ return SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+}
+#endif
+
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
{
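
__smb2_reconnect() above matches DFS targets against the connected server by
hostname. extract_unc_hostname() is defined elsewhere in the series; a
plausible sketch of its contract, inferred from the call sites (the
implementation here is an assumption):

static void sketch_extract_unc_hostname(const char *unc,
					const char **host, size_t *len)
{
	const char *p;

	/* skip leading separators, e.g. the "\\" of "\\server\share" */
	while (*unc == '\\' || *unc == '/')
		unc++;

	/* the host runs until the next separator or end of string */
	for (p = unc; *p && *p != '\\' && *p != '/'; p++)
		;

	*host = unc;
	*len = p - unc;
}
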
@@ -159,6 +242,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
struct nls_table *nls_codepage;
struct cifs_ses *ses;
struct TCP_Server_Info *server;
+ int retries;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -192,9 +276,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
ses = tcon->ses;
server = ses->server;
+ retries = server->nr_targets;
+
/*
- * Give demultiplex thread up to 10 seconds to reconnect, should be
- * greater than cifs socket timeout which is 7 seconds
+	 * Give the demultiplex thread up to 10 seconds for each target
+	 * available for reconnect -- should be greater than the cifs socket
+	 * timeout, which is 7 seconds.
*/
while (server->tcpStatus == CifsNeedReconnect) {
/*
@@ -225,6 +312,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (server->tcpStatus != CifsNeedReconnect)
break;
+ if (--retries)
+ continue;
+
/*
* on "soft" mounts we wait once. Hard mounts keep
* retrying until process is killed or server comes
@@ -234,6 +324,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
return -EHOSTDOWN;
}
+ retries = server->nr_targets;
}
if (!tcon->ses->need_reconnect && !tcon->need_reconnect)
@@ -271,7 +362,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (tcon->use_persistent)
tcon->need_reopen_files = true;
- rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
+ rc = __smb2_reconnect(nls_codepage, tcon);
mutex_unlock(&tcon->ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
@@ -369,10 +460,6 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
}
-/* offset is sizeof smb2_negotiate_req but rounded up to 8 bytes */
-#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) */
-
-
#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
@@ -409,10 +496,24 @@ static void
assemble_neg_contexts(struct smb2_negotiate_req *req,
unsigned int *total_len)
{
- char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT;
+ char *pneg_ctxt = (char *)req;
unsigned int ctxt_len;
- *total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */
+ if (*total_len > 200) {
+		/* In case the length is corrupted, don't overrun the smb buffer */
+ cifs_dbg(VFS, "Bad frame length assembling neg contexts\n");
+ return;
+ }
+
+ /*
+	 * Round up total_len of the fixed part of the SMB3 negotiate request
+	 * to an 8-byte boundary before adding negotiate contexts.
+ */
+ *total_len = roundup(*total_len, 8);
+
+ pneg_ctxt = (*total_len) + (char *)req;
+ req->NegotiateContextOffset = cpu_to_le32(*total_len);
+
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
*total_len += ctxt_len;
@@ -426,7 +527,6 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
- req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
req->NegotiateContextCount = cpu_to_le16(3);
}
@@ -642,8 +742,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
- req->DialectCount = cpu_to_le16(3);
- total_len += 6;
+ req->Dialects[3] = cpu_to_le16(SMB311_PROT_ID);
+ req->DialectCount = cpu_to_le16(4);
+ total_len += 8;
} else {
/* otherwise send specific dialect */
req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -667,7 +768,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
else {
memcpy(req->ClientGUID, server->client_guid,
SMB2_CLIENT_GUID_SIZE);
- if (ses->server->vals->protocol_id == SMB311_PROT_ID)
+ if ((ses->server->vals->protocol_id == SMB311_PROT_ID) ||
+ (strcmp(ses->server->vals->version_string,
+ SMBDEFAULT_VERSION_STRING) == 0))
assemble_neg_contexts(req, &total_len);
}
iov[0].iov_base = (char *)req;
@@ -712,7 +815,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
ses->server->ops = &smb21_operations;
- }
+ } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+ ses->server->ops = &smb311_operations;
} else if (le16_to_cpu(rsp->DialectRevision) !=
ses->server->vals->protocol_id) {
/* if requested single dialect ensure returned dialect matched */
@@ -859,13 +963,14 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
pneg_inbuf->DialectCount = cpu_to_le16(2);
/* structure is big enough for 3 dialects, sending only 2 */
inbuflen = sizeof(*pneg_inbuf) -
- sizeof(pneg_inbuf->Dialects[0]);
+ (2 * sizeof(pneg_inbuf->Dialects[0]));
} else if (strcmp(tcon->ses->server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0) {
pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
- pneg_inbuf->DialectCount = cpu_to_le16(3);
+ pneg_inbuf->Dialects[3] = cpu_to_le16(SMB311_PROT_ID);
+ pneg_inbuf->DialectCount = cpu_to_le16(4);
-		/* structure is big enough for 3 dialects */
+		/* structure is big enough for 4 dialects */
inbuflen = sizeof(*pneg_inbuf);
} else {
@@ -1478,7 +1583,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
/* SMB2 TREE_CONNECT request must be called with TreeId == 0 */
tcon->tid = 0;
-
+ atomic_set(&tcon->num_remote_opens, 0);
rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req,
&total_len);
if (rc) {
@@ -1512,7 +1617,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
-
+ trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc);
if (rc != 0) {
if (tcon) {
cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
@@ -1559,6 +1664,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
if (tcon->ses->server->ops->validate_negotiate)
rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
tcon_exit:
free_rsp_buf(resp_buftype, rsp);
kfree(unc_path);
return rc;
@@ -1954,7 +2060,6 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
struct smb_rqst rqst;
struct smb2_create_req *req;
struct smb2_create_rsp *rsp = NULL;
- struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
struct kvec iov[3]; /* make sure at least one for each open context */
struct kvec rsp_iov = {NULL, 0};
@@ -1977,9 +2082,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
if (!utf16_path)
return -ENOMEM;
- if (ses && (ses->server))
- server = ses->server;
- else {
+ if (!ses || !(ses->server)) {
rc = -EIO;
goto err_free_path;
}
@@ -2243,10 +2346,12 @@ SMB2_open_free(struct smb_rqst *rqst)
{
int i;
- cifs_small_buf_release(rqst->rq_iov[0].iov_base);
- for (i = 1; i < rqst->rq_nvec; i++)
- if (rqst->rq_iov[i].iov_base != smb2_padding)
- kfree(rqst->rq_iov[i].iov_base);
+ if (rqst && rqst->rq_iov) {
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base);
+ for (i = 1; i < rqst->rq_nvec; i++)
+ if (rqst->rq_iov[i].iov_base != smb2_padding)
+ kfree(rqst->rq_iov[i].iov_base);
+ }
}
int
@@ -2261,7 +2366,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
struct cifs_ses *ses = tcon->ses;
struct kvec iov[SMB2_CREATE_IOV_SIZE];
struct kvec rsp_iov = {NULL, 0};
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
@@ -2303,8 +2408,12 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
ses->Suid, oparms->create_options,
oparms->desired_access);
+ atomic_inc(&tcon->num_remote_opens);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
if (buf) {
memcpy(buf, &rsp->CreationTime, 32);
@@ -2474,13 +2583,13 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- *out_data = kmalloc(*plen, GFP_KERNEL);
+ *out_data = kmemdup((char *)rsp + le32_to_cpu(rsp->OutputOffset),
+ *plen, GFP_KERNEL);
if (*out_data == NULL) {
rc = -ENOMEM;
goto ioctl_exit;
}
- memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen);
ioctl_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -2535,7 +2644,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
void
SMB2_close_free(struct smb_rqst *rqst)
{
- cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
}
int
@@ -2547,7 +2657,7 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_ses *ses = tcon->ses;
struct kvec iov[1];
struct kvec rsp_iov;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
cifs_dbg(FYI, "Close\n");
@@ -2577,6 +2687,8 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
goto close_exit;
}
+ atomic_dec(&tcon->num_remote_opens);
+
/* BB FIXME - decode close response, update inode for caching */
close_exit:
@@ -2627,10 +2739,10 @@ smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
* If SMB buffer fields are valid, copy into temporary buffer to hold result.
* Caller must free buffer.
*/
-static int
-validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
- struct kvec *iov, unsigned int minbufsize,
- char *data)
+int
+smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
+ struct kvec *iov, unsigned int minbufsize,
+ char *data)
{
char *begin_of_buf = offset + (char *)iov->iov_base;
int rc;
@@ -2651,7 +2763,7 @@ int
SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid,
u8 info_class, u8 info_type, u32 additional_info,
- size_t output_len)
+ size_t output_len, size_t input_len, void *input)
{
struct smb2_query_info_req *req;
struct kvec *iov = rqst->rq_iov;
@@ -2669,23 +2781,25 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
req->VolatileFileId = volatile_fid;
req->AdditionalInformation = cpu_to_le32(additional_info);
- /*
- * We do not use the input buffer (do not send extra byte)
- */
- req->InputBufferOffset = 0;
-
req->OutputBufferLength = cpu_to_le32(output_len);
+ if (input_len) {
+ req->InputBufferLength = cpu_to_le32(input_len);
+ /* total_len for smb query request never close to le16 max */
+		/* total_len for an smb query request is never close to le16 max */
+ memcpy(req->Buffer, input, input_len);
+ }
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
- iov[0].iov_len = total_len - 1;
+ iov[0].iov_len = total_len - 1 + input_len;
return 0;
}
void
SMB2_query_info_free(struct smb_rqst *rqst)
{
- cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
}
static int
@@ -2699,9 +2813,10 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec iov[1];
struct kvec rsp_iov;
int rc = 0;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
+ bool allocated = false;
cifs_dbg(FYI, "Query Info\n");
@@ -2718,7 +2833,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_info_init(tcon, &rqst, persistent_fid, volatile_fid,
info_class, info_type, additional_info,
- output_len);
+ output_len, 0, NULL);
if (rc)
goto qinf_exit;
@@ -2741,14 +2856,21 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
"Error %d allocating memory for acl\n",
rc);
*dlen = 0;
+ rc = -ENOMEM;
goto qinf_exit;
}
+ allocated = true;
}
}
- rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength),
- &rsp_iov, min_len, *data);
+ rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength),
+ &rsp_iov, min_len, *data);
+ if (rc && allocated) {
+ kfree(*data);
+ *data = NULL;
+ *dlen = 0;
+ }
qinf_exit:
SMB2_query_info_free(&rqst);
@@ -2756,18 +2878,6 @@ qinf_exit:
return rc;
}
-int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- int ea_buf_size, struct smb2_file_full_ea_info *data)
-{
- return query_info(xid, tcon, persistent_fid, volatile_fid,
- FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0,
- ea_buf_size,
- sizeof(struct smb2_file_full_ea_info),
- (void **)&data,
- NULL);
-}
-
int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data)
{
@@ -2814,9 +2924,10 @@ smb2_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
- unsigned int credits_received = 1;
+ unsigned int credits_received = 0;
- if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ if (mid->mid_state == MID_RESPONSE_RECEIVED
+ || mid->mid_state == MID_RESPONSE_MALFORMED)
credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
DeleteMidQEntry(mid);
@@ -3073,7 +3184,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
- unsigned int credits_received = 1;
+ unsigned int credits_received = 0;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
@@ -3112,6 +3223,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
task_io_account_read(rdata->got_bytes);
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
+ case MID_RESPONSE_MALFORMED:
+ credits_received = le16_to_cpu(shdr->CreditRequest);
+ /* fall through */
default:
if (rdata->result != -ENODATA)
rdata->result = -EIO;
@@ -3127,8 +3241,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
rdata->mr = NULL;
}
#endif
- if (rdata->result)
+ if (rdata->result && rdata->result != -ENODATA) {
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
+ trace_smb3_read_err(0 /* xid */,
+ rdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, rdata->offset,
+ rdata->bytes, rdata->result);
+ } else
+ trace_smb3_read_done(0 /* xid */,
+ rdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->offset, rdata->got_bytes);
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
@@ -3185,12 +3308,14 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rdata->credits) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest = shdr->CreditCharge;
+ shdr->CreditRequest =
+ cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
spin_lock(&server->req_lock);
server->credits += rdata->credits -
le16_to_cpu(shdr->CreditCharge);
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
+ rdata->credits = le16_to_cpu(shdr->CreditCharge);
flags |= CIFS_HAS_CREDITS;
}
@@ -3201,13 +3326,11 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
- trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
- io_parms.tcon->tid, io_parms.tcon->ses->Suid,
- io_parms.offset, io_parms.length);
- } else
- trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
- io_parms.tcon->tid, io_parms.tcon->ses->Suid,
- io_parms.offset, io_parms.length);
+ trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid,
+ io_parms.tcon->tid,
+ io_parms.tcon->ses->Suid,
+ io_parms.offset, io_parms.length, rc);
+ }
cifs_small_buf_release(buf);
return rc;
@@ -3251,10 +3374,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
+ trace_smb3_read_err(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length,
+ rc);
}
- trace_smb3_read_err(rc, xid, req->PersistentFileId,
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, io_parms->length);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc == -ENODATA ? 0 : rc;
} else
@@ -3295,7 +3419,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
- unsigned int credits_received = 1;
+ unsigned int credits_received = 0;
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -3323,6 +3447,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
case MID_RETRY_NEEDED:
wdata->result = -EAGAIN;
break;
+ case MID_RESPONSE_MALFORMED:
+ credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ /* fall through */
default:
wdata->result = -EIO;
break;
@@ -3340,8 +3467,17 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->mr = NULL;
}
#endif
- if (wdata->result)
+ if (wdata->result) {
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+ trace_smb3_write_err(0 /* no xid */,
+ wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->offset,
+ wdata->bytes, wdata->result);
+ } else
+ trace_smb3_write_done(0 /* no xid */,
+ wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ wdata->offset, wdata->bytes);
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
@@ -3462,12 +3598,14 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (wdata->credits) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest = shdr->CreditCharge;
+ shdr->CreditRequest =
+ cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
spin_lock(&server->req_lock);
server->credits += wdata->credits -
le16_to_cpu(shdr->CreditCharge);
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
+ wdata->credits = le16_to_cpu(shdr->CreditCharge);
flags |= CIFS_HAS_CREDITS;
}
@@ -3481,10 +3619,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata->bytes, rc);
kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- } else
- trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
- tcon->tid, tcon->ses->Suid, wdata->offset,
- wdata->bytes);
+ }
async_writev_out:
cifs_small_buf_release(req);
@@ -3710,8 +3845,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
srch_inf->endOfSearch = true;
rc = 0;
- }
- cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ } else
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
goto qdir_exit;
}
@@ -3754,45 +3889,22 @@ qdir_exit:
return rc;
}
-static int
-send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+int
+SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
- u8 info_type, u32 additional_info, unsigned int num,
+ u8 info_type, u32 additional_info,
void **data, unsigned int *size)
{
- struct smb_rqst rqst;
struct smb2_set_info_req *req;
- struct smb2_set_info_rsp *rsp = NULL;
- struct kvec *iov;
- struct kvec rsp_iov;
- int rc = 0;
- int resp_buftype;
- unsigned int i;
- struct cifs_ses *ses = tcon->ses;
- int flags = 0;
- unsigned int total_len;
-
- if (!ses || !(ses->server))
- return -EIO;
-
- if (!num)
- return -EINVAL;
-
- iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL);
- if (!iov)
- return -ENOMEM;
+ struct kvec *iov = rqst->rq_iov;
+ unsigned int i, total_len;
+ int rc;
rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len);
- if (rc) {
- kfree(iov);
+ if (rc)
return rc;
- }
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(pid);
-
req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
@@ -3810,19 +3922,66 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
/* 1 for Buffer */
iov[0].iov_len = total_len - 1;
- for (i = 1; i < num; i++) {
+ for (i = 1; i < rqst->rq_nvec; i++) {
le32_add_cpu(&req->BufferLength, size[i]);
iov[i].iov_base = (char *)data[i];
iov[i].iov_len = size[i];
}
+ return 0;
+}
+
+void
+SMB2_set_info_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov)
+ cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+static int
+send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
+ u8 info_type, u32 additional_info, unsigned int num,
+ void **data, unsigned int *size)
+{
+ struct smb_rqst rqst;
+ struct smb2_set_info_rsp *rsp = NULL;
+ struct kvec *iov;
+ struct kvec rsp_iov;
+ int rc = 0;
+ int resp_buftype;
+ struct cifs_ses *ses = tcon->ses;
+ int flags = 0;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (!num)
+ return -EINVAL;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL);
+ if (!iov)
+ return -ENOMEM;
+
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = num;
+ rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid, pid,
+ info_class, info_type, additional_info,
+ data, size);
+ if (rc) {
+ kfree(iov);
+ return rc;
+ }
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
- cifs_buf_release(req);
+ SMB2_set_info_free(&rqst);
rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
if (rc != 0) {
@@ -3837,88 +3996,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
}
int
-SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
-{
- struct smb2_file_rename_info info;
- void **data;
- unsigned int size[2];
- int rc;
- int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
-
- data = kmalloc_array(2, sizeof(void *), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- info.ReplaceIfExists = 1; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
- info.FileNameLength = cpu_to_le32(len);
-
- data[0] = &info;
- size[0] = sizeof(struct smb2_file_rename_info);
-
- data[1] = target_file;
- size[1] = len + 2 /* null */;
-
- rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE,
- 0, 2, data, size);
- kfree(data);
- return rc;
-}
-
-int
-SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid)
-{
- __u8 delete_pending = 1;
- void *data;
- unsigned int size;
-
- data = &delete_pending;
- size = 1; /* sizeof __u8 */
-
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE,
- 0, 1, &data, &size);
-}
-
-int
-SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
-{
- struct smb2_file_link_info info;
- void **data;
- unsigned int size[2];
- int rc;
- int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
-
- data = kmalloc_array(2, sizeof(void *), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- info.ReplaceIfExists = 0; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
- info.FileNameLength = cpu_to_le32(len);
-
- data[0] = &info;
- size[0] = sizeof(struct smb2_file_link_info);
-
- data[1] = target_file;
- size[1] = len + 2 /* null */;
-
- rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE,
- 0, 2, data, size);
- kfree(data);
- return rc;
-}
-
-int
SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
+ u64 volatile_fid, u32 pid, __le64 *eof)
{
struct smb2_file_eof_info info;
void *data;
@@ -3929,28 +4008,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
data = &info;
size = sizeof(struct smb2_file_eof_info);
- if (is_falloc)
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- pid, FILE_ALLOCATION_INFORMATION, SMB2_O_INFO_FILE,
- 0, 1, &data, &size);
- else
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
0, 1, &data, &size);
}
int
-SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf)
-{
- unsigned int size;
- size = sizeof(FILE_BASIC_INFO);
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE,
- 0, 1, (void **)&buf, &size);
-}
-
-int
SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
struct cifs_ntsd *pnntsd, int pacllen, int aclflag)
@@ -4054,7 +4117,6 @@ static int
build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
int outbuf_len, u64 persistent_fid, u64 volatile_fid)
{
- struct TCP_Server_Info *server;
int rc;
struct smb2_query_info_req *req;
unsigned int total_len;
@@ -4064,8 +4126,6 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
return -EIO;
- server = tcon->ses->server;
-
rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req,
&total_len);
if (rc)
@@ -4350,6 +4410,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buf_type;
+ __u64 *please_key_high;
+ __u64 *please_key_low;
cifs_dbg(FYI, "SMB2_lease_break\n");
rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req,
@@ -4379,10 +4441,16 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
+ please_key_low = (__u64 *)lease_key;
+ please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+ trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
- }
+ } else
+ trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ ses->Suid, *please_key_low, *please_key_high);
return rc;
}
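The lease-break hunk above derives the two __u64 trace arguments by pointing into the 16-byte lease key. A standalone sketch of the same split (plain userspace C, not kernel code; memcpy is used instead of the kernel's pointer casts, which rely on the key buffer being suitably aligned):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t lease_key[16] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
	};
	uint64_t low, high;

	memcpy(&low, lease_key, 8);		/* bytes 0..7  */
	memcpy(&high, lease_key + 8, 8);	/* bytes 8..15 */

	/* mirrors the trace format "lease_key=0x%llx%llx" (high, low) */
	printf("lease_key=0x%016llx%016llx\n",
	       (unsigned long long)high, (unsigned long long)low);
	return 0;
}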
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 8fb7887f2b3d..538e2299805f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -84,8 +84,9 @@
#define NUMBER_OF_SMB2_COMMANDS 0x0013
-/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */
-#define MAX_SMB2_HDR_SIZE 0x00b0
+/* 52 transform hdr + 64 hdr + 88 create rsp */
+#define SMB2_TRANSFORM_HEADER_SIZE 52
+#define MAX_SMB2_HDR_SIZE 204
#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
@@ -613,6 +614,8 @@ struct smb2_tree_disconnect_rsp {
#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
/*
* Maximum number of iovs we need for an open/create request.
@@ -646,11 +649,18 @@ struct smb2_create_req {
__u8 Buffer[0];
} __packed;
+/*
+ * Maximum size of an SMB2_CREATE response is 64 (smb2 header) +
+ * 88 (fixed part of create response) + 520 (path) + 150 (contexts) +
+ * 2 bytes of padding.
+ */
+#define MAX_SMB2_CREATE_RESPONSE_SIZE 824
+
struct smb2_create_rsp {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 89 */
__u8 OplockLevel;
- __u8 Reserved;
+ __u8 Flag; /* 0x01 if reparse point */
__le32 CreateAction;
__le64 CreationTime;
__le64 LastAccessTime;
@@ -840,6 +850,41 @@ struct fsctl_get_integrity_information_rsp {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+/* Reparse structures - see MS-FSCC 2.1.2 */
+
+/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
+
+struct reparse_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 DataBuffer[0]; /* Variable Length */
+} __packed;
+
+struct reparse_guid_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 ReparseGuid[16];
+ __u8 DataBuffer[0]; /* Variable Length */
+} __packed;
+
+struct reparse_mount_point_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __u8 PathBuffer[0]; /* Variable Length */
+} __packed;
+
+/* See MS-FSCC 2.1.2.4 and cifspdu.h for struct reparse_symlink_data */
+
+/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
+
+
/* See MS-DFSC 2.2.2 */
struct fsctl_get_dfs_referral_req {
__le16 MaxReferralLevel;
@@ -861,7 +906,7 @@ struct validate_negotiate_info_req {
__u8 Guid[SMB2_CLIENT_GUID_SIZE];
__le16 SecurityMode;
__le16 DialectCount;
- __le16 Dialects[3]; /* BB expand this if autonegotiate > 3 dialects */
+ __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
} __packed;
struct validate_negotiate_info_rsp {
@@ -959,6 +1004,11 @@ struct smb2_close_req {
__u64 VolatileFileId; /* opaque endianness */
} __packed;
+/*
+ * Maximum size of an SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
+
struct smb2_close_rsp {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* 60 */
@@ -1174,6 +1224,15 @@ struct smb2_query_info_rsp {
__u8 Buffer[1];
} __packed;
+/*
+ * Maximum number of iovs we need for a set-info request.
+ * The largest one is rename/hardlink
+ * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
+ * [1] : path
+ * [2] : compound padding
+ */
+#define SMB2_SET_INFO_IOV_SIZE 3
+
struct smb2_set_info_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 33 */
@@ -1352,9 +1411,6 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
char FileName[0]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
-#define SMB2_MIN_EA_BUF 2048
-#define SMB2_MAX_EA_BUF 65536
-
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__le32 next_entry_offset;
__u8 flags;
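The new response-size constants above are plain byte sums taken from the adjacent comments. A minimal compile-time check of that arithmetic (standalone C11, not part of the header):

#include <stdio.h>

/* sums copied from the comments in this hunk */
_Static_assert(52 + 64 + 88 == 204, "MAX_SMB2_HDR_SIZE");
_Static_assert(64 + 88 + 520 + 150 + 2 == 824,
	       "MAX_SMB2_CREATE_RESPONSE_SIZE");
_Static_assert(64 + 60 == 124, "MAX_SMB2_CLOSE_RESPONSE_SIZE");

int main(void)
{
	puts("size constants check out");
	return 0;
}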
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index b4076577eeb7..87733b27a65f 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -116,6 +116,9 @@ extern void smb2_reconnect_server(struct work_struct *work);
extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server);
extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
struct smb_rqst *rqst);
+extern void smb2_set_next_command(struct cifs_tcon *tcon,
+ struct smb_rqst *rqst);
+extern void smb2_set_related(struct smb_rqst *rqst);
/*
* SMB2 Worker functions - most of protocol specific implementation details
@@ -150,17 +153,14 @@ extern int SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
extern void SMB2_close_free(struct smb_rqst *rqst);
extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
-extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_file_id, u64 volatile_file_id,
- int ea_buf_size,
- struct smb2_file_full_ea_info *data);
extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct smb2_file_all_info *data);
extern int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid,
u8 info_class, u8 info_type,
- u32 additional_info, size_t output_len);
+ u32 additional_info, size_t output_len,
+ size_t input_len, void *input);
extern void SMB2_query_info_free(struct smb_rqst *rqst);
extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
@@ -179,20 +179,14 @@ extern int SMB2_echo(struct TCP_Server_Info *server);
extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, int index,
struct cifs_search_info *srch_inf);
-extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- __le16 *target_file);
-extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid);
-extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- __le16 *target_file);
extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 pid,
- __le64 *eof, bool is_fallocate);
-extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- FILE_BASIC_INFO *buf);
+ __le64 *eof);
+extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 pid,
+ u8 info_class, u8 info_type, u32 additional_info,
+ void **data, unsigned int *size);
+extern void SMB2_set_info_free(struct smb_rqst *rqst);
extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
struct cifs_ntsd *pnntsd, int pacllen, int aclflag);
@@ -232,10 +226,20 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
+extern int smb2_validate_and_copy_iov(unsigned int offset,
+ unsigned int buffer_length,
+ struct kvec *iov,
+ unsigned int minbufsize, char *data);
extern void smb2_copy_fs_info_to_kstatfs(
struct smb2_fs_full_size_info *pfs_inf,
struct kstatfs *kst);
extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
extern int smb311_update_preauth_hash(struct cifs_ses *ses,
struct kvec *iov, int nvec);
+extern int smb2_query_info_compound(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ __le16 *utf16_path, u32 desired_access,
+ u32 class, u32 type, u32 output_len,
+ struct kvec *rsp, int *buftype,
+ struct cifs_sb_info *cifs_sb);
#endif /* _SMB2PROTO_H */
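smb2_set_next_command() and smb2_set_related() exist so callers can chain several smb_rqst structures into one compound send, as smb2_query_info_compound() does. A simplified userspace model of that chaining (the struct and helper below are stand-ins, not the kernel API):

#include <stdio.h>

struct rqst {			/* stand-in for struct smb_rqst */
	const char *name;
	int related;		/* models SMB2_FLAGS_RELATED_OPERATIONS */
};

static void set_related(struct rqst *r)
{
	r->related = 1;
}

int main(void)
{
	/* a typical open -> query -> close compound */
	struct rqst chain[] = {
		{ "SMB2_CREATE", 0 },
		{ "SMB2_QUERY_INFO", 0 },
		{ "SMB2_CLOSE", 0 },
	};
	int n = sizeof(chain) / sizeof(chain[0]);
	int i;

	/* every request after the first operates on the handle produced
	 * by the previous one, so it is marked "related" */
	for (i = 1; i < n; i++)
		set_related(&chain[i]);

	for (i = 0; i < n; i++)
		printf("%d: %s%s\n", i, chain[i].name,
		       chain[i].related ? " (related)" : "");
	return 0;
}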
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5fdb9a509a97..a568dac7b3a1 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1724,7 +1724,7 @@ static struct smbd_connection *_smbd_get_connection(
info->responder_resources);
/* Need to send IRD/ORD in private data for iWARP */
- info->id->device->get_port_immutable(
+ info->id->device->ops.get_port_immutable(
info->id->device, info->id->port_num, &port_immutable);
if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
ird_ord_hdr[0] = info->responder_resources;
@@ -2054,14 +2054,22 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
info->smbd_recv_pending++;
- switch (msg->msg_iter.type) {
- case READ | ITER_KVEC:
+ if (iov_iter_rw(&msg->msg_iter) == WRITE) {
+ /* It's a bug in the upper layer to get here */
+ cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n",
+ iov_iter_rw(&msg->msg_iter));
+ rc = -EINVAL;
+ goto out;
+ }
+
+ switch (iov_iter_type(&msg->msg_iter)) {
+ case ITER_KVEC:
buf = msg->msg_iter.kvec->iov_base;
to_read = msg->msg_iter.kvec->iov_len;
rc = smbd_recv_buf(info, buf, to_read);
break;
- case READ | ITER_BVEC:
+ case ITER_BVEC:
page = msg->msg_iter.bvec->bv_page;
page_offset = msg->msg_iter.bvec->bv_offset;
to_read = msg->msg_iter.bvec->bv_len;
@@ -2071,10 +2079,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
default:
/* It's a bug in the upper layer to get here */
cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
- msg->msg_iter.type);
+ iov_iter_type(&msg->msg_iter));
rc = -EINVAL;
}
+out:
info->smbd_recv_pending--;
wake_up(&info->wait_smbd_recv_pending);
@@ -2295,8 +2304,12 @@ static void smbd_mr_recovery_work(struct work_struct *work)
int rc;
list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_INVALIDATED ||
- smbdirect_mr->state == MR_ERROR) {
+ if (smbdirect_mr->state == MR_INVALIDATED)
+ ib_dma_unmap_sg(
+ info->id->device, smbdirect_mr->sgl,
+ smbdirect_mr->sgl_count,
+ smbdirect_mr->dir);
+ else if (smbdirect_mr->state == MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2320,25 +2333,21 @@ static void smbd_mr_recovery_work(struct work_struct *work)
smbd_disconnect_rdma_connection(info);
continue;
}
+ } else
+ /* This MR is being used, don't recover it */
+ continue;
- if (smbdirect_mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count,
- smbdirect_mr->dir);
-
- smbdirect_mr->state = MR_READY;
+ smbdirect_mr->state = MR_READY;
- /* smbdirect_mr->state is updated by this function
- * and is read and updated by I/O issuing CPUs trying
- * to get a MR, the call to atomic_inc_return
- * implicates a memory barrier and guarantees this
- * value is updated before waking up any calls to
- * get_mr() from the I/O issuing CPUs
- */
- if (atomic_inc_return(&info->mr_ready_count) == 1)
- wake_up_interruptible(&info->wait_mr);
- }
+ /* smbdirect_mr->state is updated by this function
+ * and is read and updated by I/O issuing CPUs trying
+ * to get a MR, the call to atomic_inc_return
+ * implicates a memory barrier and guarantees this
+ * value is updated before waking up any calls to
+ * get_mr() from the I/O issuing CPUs
+ */
+ if (atomic_inc_return(&info->mr_ready_count) == 1)
+ wake_up_interruptible(&info->wait_mr);
}
}
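The reworked recovery loop above distinguishes three MR states: MR_INVALIDATED entries only need their sg list unmapped, MR_ERROR entries need the MR deregistered and reallocated, and anything else is in use and must be skipped. A standalone sketch of that state handling (userspace C; the DMA/verbs calls are reduced to comments):

#include <stdio.h>

enum mr_state { MR_READY, MR_REGISTERED, MR_INVALIDATED, MR_ERROR };

static int recover_one(enum mr_state *st)
{
	switch (*st) {
	case MR_INVALIDATED:
		/* ib_dma_unmap_sg(); the MR itself is still valid */
		break;
	case MR_ERROR:
		/* ib_dereg_mr() + ib_alloc_mr() to get a fresh MR */
		break;
	default:
		return 0;	/* in use: leave it alone */
	}
	*st = MR_READY;		/* publish before waking get_mr() waiters */
	return 1;
}

int main(void)
{
	enum mr_state mrs[] = { MR_INVALIDATED, MR_REGISTERED, MR_ERROR };
	int i, ready = 0;

	for (i = 0; i < 3; i++)
		ready += recover_one(&mrs[i]);
	printf("%d MRs made ready\n", ready);	/* prints 2 */
	return 0;
}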
diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c
index bd4a546feec1..465483787193 100644
--- a/fs/cifs/trace.c
+++ b/fs/cifs/trace.c
@@ -3,16 +3,6 @@
* Copyright (C) 2018, Microsoft Corporation.
*
* Author(s): Steve French <stfrench@microsoft.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d4aed5217a56..59be48206932 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -3,16 +3,6 @@
* Copyright (C) 2018, Microsoft Corporation.
*
* Author(s): Steve French <stfrench@microsoft.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cifs
@@ -374,6 +364,48 @@ DEFINE_SMB3_ENTER_EXIT_EVENT(enter);
DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done);
/*
+ * For SMB2/SMB3 tree connect
+ */
+
+DECLARE_EVENT_CLASS(smb3_tcon_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ const char *unc_name,
+ int rc),
+ TP_ARGS(xid, tid, sesid, unc_name, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(const char *, unc_name)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->unc_name = unc_name;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->unc_name, __entry->rc)
+)
+
+#define DEFINE_SMB3_TCON_EVENT(name) \
+DEFINE_EVENT(smb3_tcon_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ const char *unc_name, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, unc_name, rc))
+
+DEFINE_SMB3_TCON_EVENT(tcon);
+
+
+/*
* For smb2/smb3 open call
*/
DECLARE_EVENT_CLASS(smb3_open_err_class,
@@ -460,6 +492,85 @@ DEFINE_EVENT(smb3_open_done_class, smb3_##name, \
DEFINE_SMB3_OPEN_DONE_EVENT(open_done);
DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done);
+
+DECLARE_EVENT_CLASS(smb3_lease_done_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state)
+)
+
+#define DEFINE_SMB3_LEASE_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
+
+DECLARE_EVENT_CLASS(smb3_lease_err_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high,
+ int rc),
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ __entry->rc = rc;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x rc=%d",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->rc)
+)
+
+#define DEFINE_SMB3_LEASE_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high, \
+ int rc), \
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
+
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+
DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_PROTO(__u64 currmid,
char *hostname),
@@ -486,6 +597,36 @@ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \
DEFINE_SMB3_RECONNECT_EVENT(reconnect);
DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect);
+DECLARE_EVENT_CLASS(smb3_credit_class,
+ TP_PROTO(__u64 currmid,
+ char *hostname,
+ int credits),
+ TP_ARGS(currmid, hostname, credits),
+ TP_STRUCT__entry(
+ __field(__u64, currmid)
+ __field(char *, hostname)
+ __field(int, credits)
+ ),
+ TP_fast_assign(
+ __entry->currmid = currmid;
+ __entry->hostname = hostname;
+ __entry->credits = credits;
+ ),
+ TP_printk("server=%s current_mid=0x%llx credits=%d",
+ __entry->hostname,
+ __entry->currmid,
+ __entry->credits)
+)
+
+#define DEFINE_SMB3_CREDIT_EVENT(name) \
+DEFINE_EVENT(smb3_credit_class, smb3_##name, \
+ TP_PROTO(__u64 currmid, \
+ char *hostname, \
+ int credits), \
+ TP_ARGS(currmid, hostname, credits))
+
+DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+
#endif /* _CIFS_TRACE_H */
#undef TRACE_INCLUDE_PATH
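Since TRACE_SYSTEM is "cifs", the events defined above surface under events/cifs/ in tracefs. A small sketch of enabling the new ones from userspace (this assumes tracefs is mounted at /sys/kernel/tracing; older setups may use /sys/kernel/debug/tracing instead):

#include <stdio.h>

int main(void)
{
	const char *events[] = {
		"/sys/kernel/tracing/events/cifs/smb3_tcon/enable",
		"/sys/kernel/tracing/events/cifs/smb3_lease_done/enable",
		"/sys/kernel/tracing/events/cifs/smb3_lease_err/enable",
	};
	unsigned int i;

	for (i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
		FILE *f = fopen(events[i], "w");

		if (!f) {
			perror(events[i]);
			continue;
		}
		fputs("1", f);	/* writing "0" disables again */
		fclose(f);
	}
	return 0;
}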
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b48f43963da6..53532bd3f50d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -113,13 +113,24 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
cifs_small_buf_release(midEntry->resp_buf);
#ifdef CONFIG_CIFS_STATS2
now = jiffies;
- /* commands taking longer than one second are indications that
- something is wrong, unless it is quite a slow link or server */
- if (time_after(now, midEntry->when_alloc + HZ) &&
+ /*
+ * Commands taking longer than one second (default) can be an
+ * indication that something is wrong, unless it is quite a slow
+ * link or a very busy server. Note that this calculation is
+ * unlikely to wrap as long as slow_rsp_threshold is not set far
+ * above the recommended maximum (32767, i.e. 9 hours), and even a
+ * wrong result is harmless since it only affects debug counters -
+ * so the calculation is left as a simple comparison rather than
+ * adding multiple conversions and overflow checks.
+ */
+ if ((slow_rsp_threshold != 0) &&
+ time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) &&
(midEntry->command != command)) {
- /* smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command */
- if ((le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) &&
- (le16_to_cpu(midEntry->command) >= 0))
+ /*
+ * smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command
+ * NB: le16_to_cpu returns unsigned, so it cannot be negative below
+ */
+ if (le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS)
cifs_stats_inc(&midEntry->server->smb2slowcmd[le16_to_cpu(midEntry->command)]);
trace_smb3_slow_rsp(le16_to_cpu(midEntry->command),
@@ -128,7 +139,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
if (cifsFYI & CIFS_TIMER) {
pr_debug(" CIFS slow rsp: cmd %d mid %llu",
midEntry->command, midEntry->mid);
- pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
+ cifs_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
now - midEntry->when_alloc,
now - midEntry->when_sent,
now - midEntry->when_received);
@@ -307,8 +318,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
.iov_base = &rfc1002_marker,
.iov_len = 4
};
- iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov,
- 1, 4);
+ iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
goto uncork;
@@ -329,8 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
size += iov[i].iov_len;
}
- iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC,
- iov, n_vec, size);
+ iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
@@ -346,7 +355,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rqst_page_get_length(&rqst[j], i, &bvec.bv_len,
&bvec.bv_offset);
- iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC,
+ iov_iter_bvec(&smb_msg.msg_iter, WRITE,
&bvec, 1, bvec.bv_len);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
@@ -378,7 +387,7 @@ smbd_done:
if (rc < 0 && rc != -EINTR)
cifs_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
- else
+ else if (rc > 0)
rc = 0;
return rc;
@@ -774,8 +783,25 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
}
static void
-cifs_noop_callback(struct mid_q_entry *mid)
+cifs_compound_callback(struct mid_q_entry *mid)
+{
+ struct TCP_Server_Info *server = mid->server;
+
+ add_credits(server, server->ops->get_credits(mid), mid->optype);
+}
+
+static void
+cifs_compound_last_callback(struct mid_q_entry *mid)
{
+ cifs_compound_callback(mid);
+ cifs_wake_up_task(mid);
+}
+
+static void
+cifs_cancelled_callback(struct mid_q_entry *mid)
+{
+ cifs_compound_callback(mid);
+ DeleteMidQEntry(mid);
}
int
@@ -786,7 +812,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
int i, j, rc = 0;
int timeout, optype;
struct mid_q_entry *midQ[MAX_COMPOUND];
- unsigned int credits = 1;
+ bool cancelled_mid[MAX_COMPOUND] = {false};
+ unsigned int credits[MAX_COMPOUND] = {0};
char *buf;
timeout = flags & CIFS_TIMEOUT_MASK;
@@ -804,13 +831,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -ENOENT;
/*
- * Ensure that we do not send more than 50 overlapping requests
- * to the same server. We may make this configurable later or
- * use ses->maxReq.
+ * Ensure we obtain 1 credit per request in the compound chain.
+ * It could be optimized further by waiting for all the credits
+ * at once, but that may block for a long time if we don't have
+ * enough credits due to heavy operations in progress or the
+ * server not granting us many, so a fallback to the current
+ * approach would be needed anyway.
*/
- rc = wait_for_free_request(ses->server, timeout, optype);
- if (rc)
- return rc;
+ for (i = 0; i < num_rqst; i++) {
+ rc = wait_for_free_request(ses->server, timeout, optype);
+ if (rc) {
+ /*
+ * We haven't sent an SMB packet to the server yet but
+ * we already obtained credits for i requests in the
+ * compound chain - need to return those credits back
+ * for future use. Note that we need to call add_credits
+ * multiple times to match the way we obtained credits
+ * in the first place and to account for in flight
+ * requests correctly.
+ */
+ for (j = 0; j < i; j++)
+ add_credits(ses->server, 1, optype);
+ return rc;
+ }
+ credits[i] = 1;
+ }
/*
* Make sure that we sign in the same order that we send on this socket
@@ -826,18 +871,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
mutex_unlock(&ses->server->srv_mutex);
+
/* Update # of requests on wire to server */
- add_credits(ses->server, 1, optype);
+ for (j = 0; j < num_rqst; j++)
+ add_credits(ses->server, credits[j], optype);
return PTR_ERR(midQ[i]);
}
midQ[i]->mid_state = MID_REQUEST_SUBMITTED;
+ midQ[i]->optype = optype;
/*
- * We don't invoke the callback compounds unless it is the last
- * request.
+ * Invoke callback for every part of the compound chain
+ * to calculate credits properly. Wake up this thread only when
+ * the last element is received.
*/
if (i < num_rqst - 1)
- midQ[i]->callback = cifs_noop_callback;
+ midQ[i]->callback = cifs_compound_callback;
+ else
+ midQ[i]->callback = cifs_compound_last_callback;
}
cifs_in_send_inc(ses->server);
rc = smb_send_rqst(ses->server, num_rqst, rqst, flags);
@@ -851,37 +902,61 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
mutex_unlock(&ses->server->srv_mutex);
- for (i = 0; i < num_rqst; i++) {
- if (rc < 0)
- goto out;
+ if (rc < 0) {
+ /* Sending failed for some reason - return credits back */
+ for (i = 0; i < num_rqst; i++)
+ add_credits(ses->server, credits[i], optype);
+ goto out;
+ }
+
+ /*
+ * At this point the request is passed to the network stack - we assume
+ * that any credits taken from the server structure on the client have
+ * been spent and we can't return them back. Once we receive responses
+ * we will collect credits granted by the server in the mid callbacks
+ * and add those credits to the server structure.
+ */
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
- smb311_update_preauth_hash(ses, rqst[i].rq_iov,
- rqst[i].rq_nvec);
+ /*
+ * Compounding is never used during session establishment.
+ */
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
+ smb311_update_preauth_hash(ses, rqst[0].rq_iov,
+ rqst[0].rq_nvec);
- if (timeout == CIFS_ASYNC_OP)
- goto out;
+ if (timeout == CIFS_ASYNC_OP)
+ goto out;
+ for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(ses->server, midQ[i]);
- if (rc != 0) {
- cifs_dbg(FYI, "Cancelling wait for mid %llu\n",
- midQ[i]->mid);
+ if (rc != 0)
+ break;
+ }
+ if (rc != 0) {
+ for (; i < num_rqst; i++) {
+ cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
+ midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(ses->server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
- midQ[i]->callback = DeleteMidQEntry;
- spin_unlock(&GlobalMid_Lock);
- add_credits(ses->server, 1, optype);
- return rc;
+ midQ[i]->callback = cifs_cancelled_callback;
+ cancelled_mid[i] = true;
+ credits[i] = 0;
}
spin_unlock(&GlobalMid_Lock);
}
+ }
+
+ for (i = 0; i < num_rqst; i++) {
+ if (rc < 0)
+ goto out;
rc = cifs_sync_mid_result(midQ[i], ses->server);
if (rc != 0) {
- add_credits(ses->server, 1, optype);
- return rc;
+ /* mark this mid as cancelled to not free it below */
+ cancelled_mid[i] = true;
+ goto out;
}
if (!midQ[i]->resp_buf ||
@@ -901,23 +976,26 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
else
resp_buf_type[i] = CIFS_SMALL_BUFFER;
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
- struct kvec iov = {
- .iov_base = resp_iov[i].iov_base,
- .iov_len = resp_iov[i].iov_len
- };
- smb311_update_preauth_hash(ses, &iov, 1);
- }
-
- credits = ses->server->ops->get_credits(midQ[i]);
-
rc = ses->server->ops->check_receive(midQ[i], ses->server,
flags & CIFS_LOG_ERROR);
/* mark it so buf will not be freed by cifs_delete_mid */
if ((flags & CIFS_NO_RESP) == 0)
midQ[i]->resp_buf = NULL;
+
}
+
+ /*
+ * Compounding is never used during session establishment.
+ */
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
+ struct kvec iov = {
+ .iov_base = resp_iov[0].iov_base,
+ .iov_len = resp_iov[0].iov_len
+ };
+ smb311_update_preauth_hash(ses, &iov, 1);
+ }
+
out:
/*
* This will dequeue all mids. After this it is important that the
@@ -925,9 +1003,10 @@ out:
* This is prevented above by using a noop callback that will not
* wake this thread except for the very last PDU.
*/
- for (i = 0; i < num_rqst; i++)
- cifs_delete_mid(midQ[i]);
- add_credits(ses->server, credits, optype);
+ for (i = 0; i < num_rqst; i++) {
+ if (!cancelled_mid[i])
+ cifs_delete_mid(midQ[i]);
+ }
return rc;
}
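The core of the compound_send_recv() changes is per-request credit accounting: one credit is taken per message in the chain, and on any failure exactly what was taken is handed back. A standalone model of that scheme (userspace C; wait_for_free_request()/add_credits() are reduced to a plain counter):

#include <stdio.h>

static int credits = 2;		/* pretend the server granted 2 */

static int take_credit(void)
{
	if (credits <= 0)
		return -1;
	credits--;
	return 0;
}

static void add_credit(void)
{
	credits++;
}

static int send_compound(int num_rqst)
{
	int i, j;

	for (i = 0; i < num_rqst; i++) {
		if (take_credit() < 0) {
			/* nothing was sent yet: return the i credits
			 * we already obtained */
			for (j = 0; j < i; j++)
				add_credit();
			return -1;
		}
	}
	/* ...send; credits now come back via per-mid callbacks... */
	return 0;
}

int main(void)
{
	printf("3-chain: %s, credits left: %d\n",
	       send_compound(3) ? "failed" : "sent", credits);
	return 0;
}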
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ce2cc2169040..6e30949d9f77 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -64,11 +64,6 @@
#include <linux/hiddev.h>
-#define __DVB_CORE__
-#include <linux/dvb/audio.h>
-#include <linux/dvb/dmx.h>
-#include <linux/dvb/frontend.h>
-#include <linux/dvb/video.h>
#include <linux/sort.h>
@@ -95,71 +90,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return vfs_ioctl(file, cmd, arg);
}
-struct compat_video_event {
- int32_t type;
- compat_time_t timestamp;
- union {
- video_size_t size;
- unsigned int frame_rate;
- } u;
-};
-
-static int do_video_get_event(struct file *file,
- unsigned int cmd, struct compat_video_event __user *up)
-{
- struct video_event __user *kevent =
- compat_alloc_user_space(sizeof(*kevent));
- int err;
-
- if (kevent == NULL)
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long)kevent);
- if (!err) {
- err = convert_in_user(&kevent->type, &up->type);
- err |= convert_in_user(&kevent->timestamp, &up->timestamp);
- err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
- err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
- err |= convert_in_user(&kevent->u.size.aspect_ratio,
- &up->u.size.aspect_ratio);
- if (err)
- err = -EFAULT;
- }
-
- return err;
-}
-
-struct compat_video_still_picture {
- compat_uptr_t iFrame;
- int32_t size;
-};
-
-static int do_video_stillpicture(struct file *file,
- unsigned int cmd, struct compat_video_still_picture __user *up)
-{
- struct video_still_picture __user *up_native;
- compat_uptr_t fp;
- int32_t size;
- int err;
-
- err = get_user(fp, &up->iFrame);
- err |= get_user(size, &up->size);
- if (err)
- return -EFAULT;
-
- up_native =
- compat_alloc_user_space(sizeof(struct video_still_picture));
-
- err = put_user(compat_ptr(fp), &up_native->iFrame);
- err |= put_user(size, &up_native->size);
- if (err)
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long) up_native);
-
- return err;
-}
-
#ifdef CONFIG_BLOCK
typedef struct sg_io_hdr32 {
compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
@@ -958,61 +888,6 @@ COMPATIBLE_IOCTL(HIDIOCGFLAG)
COMPATIBLE_IOCTL(HIDIOCSFLAG)
COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
-/* dvb */
-COMPATIBLE_IOCTL(AUDIO_STOP)
-COMPATIBLE_IOCTL(AUDIO_PLAY)
-COMPATIBLE_IOCTL(AUDIO_PAUSE)
-COMPATIBLE_IOCTL(AUDIO_CONTINUE)
-COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
-COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
-COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
-COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
-COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
-COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(AUDIO_SET_ID)
-COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
-COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(DMX_START)
-COMPATIBLE_IOCTL(DMX_STOP)
-COMPATIBLE_IOCTL(DMX_SET_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
-COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
-COMPATIBLE_IOCTL(DMX_GET_STC)
-COMPATIBLE_IOCTL(DMX_REQBUFS)
-COMPATIBLE_IOCTL(DMX_QUERYBUF)
-COMPATIBLE_IOCTL(DMX_EXPBUF)
-COMPATIBLE_IOCTL(DMX_QBUF)
-COMPATIBLE_IOCTL(DMX_DQBUF)
-COMPATIBLE_IOCTL(VIDEO_STOP)
-COMPATIBLE_IOCTL(VIDEO_PLAY)
-COMPATIBLE_IOCTL(VIDEO_FREEZE)
-COMPATIBLE_IOCTL(VIDEO_CONTINUE)
-COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
-COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
-COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
-COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
-COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
-/* cec */
-COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_G_MODE)
-COMPATIBLE_IOCTL(CEC_S_MODE)
-COMPATIBLE_IOCTL(CEC_TRANSMIT)
-COMPATIBLE_IOCTL(CEC_RECEIVE)
-COMPATIBLE_IOCTL(CEC_DQEVENT)
-
/* joystick */
COMPATIBLE_IOCTL(JSIOCGVERSION)
COMPATIBLE_IOCTL(JSIOCGAXES)
@@ -1080,12 +955,6 @@ static long do_ioctl_trans(unsigned int cmd,
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
return rtc_ioctl(file, cmd, argp);
-
- /* dvb */
- case VIDEO_GET_EVENT:
- return do_video_get_event(file, cmd, argp);
- case VIDEO_STILLPICTURE:
- return do_video_stillpicture(file, cmd, argp);
}
/*
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f408994fc632..9352487bd0fc 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -202,7 +202,8 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
continue;
blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT;
blk_offset += offset;
- if (blk_offset + len > BUFFER_SIZE)
+ if (blk_offset > BUFFER_SIZE ||
+ blk_offset + len > BUFFER_SIZE)
continue;
return read_buffers[i] + blk_offset;
}
@@ -418,9 +419,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
int i;
vma->vm_flags |= VM_MIXEDMAP;
for (i = 0; i < pages && !ret; i++) {
+ vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ if (vmf & VM_FAULT_ERROR)
+ ret = vm_fault_to_errno(vmf, 0);
}
}
@@ -869,8 +873,8 @@ static int cramfs_readpage(struct file *file, struct page *page)
if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
/* See comments on earlier code. */
u32 prev_start = block_start;
- block_start = prev_start & ~CRAMFS_BLK_FLAGS;
- block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+ block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+ block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
block_start += PAGE_SIZE;
} else {
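The extra "blk_offset > BUFFER_SIZE" test in the cramfs hunk above matters because, with unsigned arithmetic, blk_offset + len can wrap around and slip past the single comparison. A standalone demonstration of the wraparound case (32-bit unsigned values and an illustrative buffer size assumed):

#include <stdio.h>

#define BUFFER_SIZE (16u * 1024u)	/* illustrative size */

int main(void)
{
	unsigned int blk_offset = 0xffffff00u;	/* from a corrupt image */
	unsigned int len = 0x200u;

	/* 0xffffff00 + 0x200 wraps to 0x100, so the naive check passes */
	printf("naive check:   %s\n",
	       blk_offset + len > BUFFER_SIZE ? "reject" : "ACCEPT (bad)");

	/* the patched check rejects the offset before the addition */
	printf("patched check: %s\n",
	       (blk_offset > BUFFER_SIZE || blk_offset + len > BUFFER_SIZE)
	       ? "reject" : "accept");
	return 0;
}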
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0959044c5cee..5759bcd018cd 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
int ret = fscrypt_decrypt_page(page->mapping->host, page,
PAGE_SIZE, 0, page->index);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 0f46cf550907..4dc788e3bc96 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -133,15 +133,25 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
}
EXPORT_SYMBOL(fscrypt_get_ctx);
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+ const struct fscrypt_info *ci)
+{
+ memset(iv, 0, ci->ci_mode->ivsize);
+ iv->lblk_num = cpu_to_le64(lblk_num);
+
+ if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY)
+ memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+ if (ci->ci_essiv_tfm != NULL)
+ crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw);
+}
+
int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
u64 lblk_num, struct page *src_page,
struct page *dest_page, unsigned int len,
unsigned int offs, gfp_t gfp_flags)
{
- struct {
- __le64 index;
- u8 padding[FS_IV_SIZE - sizeof(__le64)];
- } iv;
+ union fscrypt_iv iv;
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
@@ -151,15 +161,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
BUG_ON(len == 0);
- BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE);
- BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE);
- iv.index = cpu_to_le64(lblk_num);
- memset(iv.padding, 0, sizeof(iv.padding));
-
- if (ci->ci_essiv_tfm != NULL) {
- crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv,
- (u8 *)&iv);
- }
+ fscrypt_generate_iv(&iv, lblk_num, ci);
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req)
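fscrypt_generate_iv() above zeroes the IV, stores the little-endian block number, copies in the per-file nonce only for DIRECT_KEY policies, and (not modeled here) runs the result through the ESSIV cipher when one is set up. A minimal userspace model of that layout, using hypothetical demo_* names:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_MAX_IV_SIZE 32
#define DEMO_NONCE_SIZE 16

union demo_iv {
	struct {
		uint64_t lblk_num;	/* stored little-endian */
		uint8_t nonce[DEMO_NONCE_SIZE];
	};
	uint8_t raw[DEMO_MAX_IV_SIZE];
};

static void demo_generate_iv(union demo_iv *iv, uint64_t lblk,
			     const uint8_t *nonce, int direct_key)
{
	int i;

	memset(iv, 0, sizeof(*iv));
	for (i = 0; i < 8; i++)		/* explicit LE encoding */
		iv->raw[i] = (uint8_t)(lblk >> (8 * i));
	if (direct_key)
		memcpy(iv->nonce, nonce, DEMO_NONCE_SIZE);
}

int main(void)
{
	uint8_t nonce[DEMO_NONCE_SIZE] = { 0xab };
	union demo_iv iv;

	demo_generate_iv(&iv, 5, nonce, 1);
	printf("lblk byte 0: %02x, nonce byte 0: %02x\n",
	       iv.raw[0], iv.raw[8]);	/* 05, ab */
	return 0;
}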
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index d7a0f682ca12..7ff40a73dbec 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -40,10 +40,11 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
{
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
- struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
- int res = 0;
- char iv[FS_CRYPTO_BLOCK_SIZE];
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
+ union fscrypt_iv iv;
struct scatterlist sg;
+ int res;
/*
* Copy the filename to the output buffer for encrypting in-place and
@@ -55,7 +56,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
memset(out + iname->len, 0, olen - iname->len);
/* Initialize the IV */
- memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+ fscrypt_generate_iv(&iv, 0, ci);
/* Set up the encryption request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
@@ -65,7 +66,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
sg_init_one(&sg, out, olen);
- skcipher_request_set_crypt(req, &sg, &sg, olen, iv);
+ skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
/* Do the encryption */
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
@@ -94,9 +95,10 @@ static int fname_decrypt(struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
- int res = 0;
- char iv[FS_CRYPTO_BLOCK_SIZE];
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
+ union fscrypt_iv iv;
+ int res;
/* Allocate request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
@@ -107,12 +109,12 @@ static int fname_decrypt(struct inode *inode,
crypto_req_done, &wait);
/* Initialize IV */
- memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+ fscrypt_generate_iv(&iv, 0, ci);
/* Create decryption request */
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 79debfc9cef9..7424f851eb5c 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -17,7 +17,6 @@
#include <crypto/hash.h>
/* Encryption parameters */
-#define FS_IV_SIZE 16
#define FS_KEY_DERIVATION_NONCE_SIZE 16
/**
@@ -52,16 +51,42 @@ struct fscrypt_symlink_data {
} __packed;
/*
- * A pointer to this structure is stored in the file system's in-core
- * representation of an inode.
+ * fscrypt_info - the "encryption key" for an inode
+ *
+ * When an encrypted file's key is made available, an instance of this struct is
+ * allocated and stored in ->i_crypt_info. Once created, it remains until the
+ * inode is evicted.
*/
struct fscrypt_info {
+
+ /* The actual crypto transform used for encryption and decryption */
+ struct crypto_skcipher *ci_ctfm;
+
+ /*
+ * Cipher for ESSIV IV generation. Only set for CBC contents
+ * encryption, otherwise is NULL.
+ */
+ struct crypto_cipher *ci_essiv_tfm;
+
+ /*
+ * Encryption mode used for this inode. It corresponds to either
+ * ci_data_mode or ci_filename_mode, depending on the inode type.
+ */
+ struct fscrypt_mode *ci_mode;
+
+ /*
+ * If non-NULL, then this inode uses a master key directly rather than a
+ * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm.
+ * Otherwise, this inode uses a derived key.
+ */
+ struct fscrypt_master_key *ci_master_key;
+
+ /* fields from the fscrypt_context */
u8 ci_data_mode;
u8 ci_filename_mode;
u8 ci_flags;
- struct crypto_skcipher *ci_ctfm;
- struct crypto_cipher *ci_essiv_tfm;
- u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+ u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
};
typedef enum {
@@ -83,6 +108,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
return true;
+ if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM &&
+ filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM)
+ return true;
+
return false;
}
@@ -107,6 +136,22 @@ fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
#define fscrypt_err(sb, fmt, ...) \
fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define FSCRYPT_MAX_IV_SIZE 32
+
+union fscrypt_iv {
+ struct {
+ /* logical block number within the file */
+ __le64 lblk_num;
+
+ /* per-file nonce; only set in DIRECT_KEY mode */
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ };
+ u8 raw[FSCRYPT_MAX_IV_SIZE];
+};
+
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+ const struct fscrypt_info *ci);
+
/* fname.c */
extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen);
@@ -115,6 +160,16 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
u32 *encrypted_len_ret);
/* keyinfo.c */
+
+struct fscrypt_mode {
+ const char *friendly_name;
+ const char *cipher_str;
+ int keysize;
+ int ivsize;
+ bool logged_impl_name;
+ bool needs_essiv;
+};
+
extern void __exit fscrypt_essiv_cleanup(void);
#endif /* _FSCRYPT_PRIVATE_H */
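The DIRECT_KEY validity check in keyinfo.c compares the mode's IV size against offsetofend(union fscrypt_iv, nonce), i.e. the IV must be able to hold both the block number and the 16-byte nonce. A standalone illustration of that arithmetic (a hypothetical demo_iv mirroring the union above):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

union demo_iv {
	struct {
		uint64_t lblk_num;
		uint8_t nonce[16];
	};
	uint8_t raw[32];
};

int main(void)
{
	/* 8 (lblk_num) + 16 (nonce) = 24, so only IVs of 24 bytes or
	 * more (in practice Adiantum's 32) can carry a direct-key nonce */
	printf("offsetofend(nonce) = %zu\n",
	       offsetofend(union demo_iv, nonce));
	return 0;
}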
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 7874c9bb2fc5..322ce9686bdb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,15 +10,21 @@
*/
#include <keys/user-type.h>
+#include <linux/hashtable.h>
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
#include <crypto/aes.h>
+#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <crypto/skcipher.h>
#include "fscrypt_private.h"
static struct crypto_shash *essiv_hash_tfm;
+/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */
+static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
+
/*
* Key derivation function. This generates the derived key by encrypting the
* master key with AES-128-ECB using the inode's nonce as the AES key.
@@ -41,7 +47,7 @@ static int derive_key_aes(const u8 *master_key,
tfm = NULL;
goto out;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
@@ -123,56 +129,37 @@ invalid:
return ERR_PTR(-ENOKEY);
}
-/* Find the master key, then derive the inode's actual encryption key */
-static int find_and_derive_key(const struct inode *inode,
- const struct fscrypt_context *ctx,
- u8 *derived_key, unsigned int derived_keysize)
-{
- struct key *key;
- const struct fscrypt_key *payload;
- int err;
-
- key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
- ctx->master_key_descriptor,
- derived_keysize, &payload);
- if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
- key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
- ctx->master_key_descriptor,
- derived_keysize, &payload);
- }
- if (IS_ERR(key))
- return PTR_ERR(key);
- err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize);
- up_read(&key->sem);
- key_put(key);
- return err;
-}
-
-static struct fscrypt_mode {
- const char *friendly_name;
- const char *cipher_str;
- int keysize;
- bool logged_impl_name;
-} available_modes[] = {
+static struct fscrypt_mode available_modes[] = {
[FS_ENCRYPTION_MODE_AES_256_XTS] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .ivsize = 16,
},
[FS_ENCRYPTION_MODE_AES_256_CTS] = {
.friendly_name = "AES-256-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
+ .ivsize = 16,
},
[FS_ENCRYPTION_MODE_AES_128_CBC] = {
.friendly_name = "AES-128-CBC",
.cipher_str = "cbc(aes)",
.keysize = 16,
+ .ivsize = 16,
+ .needs_essiv = true,
},
[FS_ENCRYPTION_MODE_AES_128_CTS] = {
.friendly_name = "AES-128-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
+ .ivsize = 16,
+ },
+ [FS_ENCRYPTION_MODE_ADIANTUM] = {
+ .friendly_name = "Adiantum",
+ .cipher_str = "adiantum(xchacha12,aes)",
+ .keysize = 32,
+ .ivsize = 32,
},
};
@@ -198,14 +185,196 @@ select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
return ERR_PTR(-EINVAL);
}
-static void put_crypt_info(struct fscrypt_info *ci)
+/* Find the master key, then derive the inode's actual encryption key */
+static int find_and_derive_key(const struct inode *inode,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, const struct fscrypt_mode *mode)
{
- if (!ci)
+ struct key *key;
+ const struct fscrypt_key *payload;
+ int err;
+
+ key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+ ctx->master_key_descriptor,
+ mode->keysize, &payload);
+ if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
+ key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
+ ctx->master_key_descriptor,
+ mode->keysize, &payload);
+ }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) {
+ if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
+ fscrypt_warn(inode->i_sb,
+ "direct key mode not allowed with %s",
+ mode->friendly_name);
+ err = -EINVAL;
+ } else if (ctx->contents_encryption_mode !=
+ ctx->filenames_encryption_mode) {
+ fscrypt_warn(inode->i_sb,
+ "direct key mode not allowed with different contents and filenames modes");
+ err = -EINVAL;
+ } else {
+ memcpy(derived_key, payload->raw, mode->keysize);
+ err = 0;
+ }
+ } else {
+ err = derive_key_aes(payload->raw, ctx, derived_key,
+ mode->keysize);
+ }
+ up_read(&key->sem);
+ key_put(key);
+ return err;
+}
+
+/* Allocate and key a symmetric cipher object for the given encryption mode */
+static struct crypto_skcipher *
+allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
+ const struct inode *inode)
+{
+ struct crypto_skcipher *tfm;
+ int err;
+
+ tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ if (IS_ERR(tfm)) {
+ fscrypt_warn(inode->i_sb,
+ "error allocating '%s' transform for inode %lu: %ld",
+ mode->cipher_str, inode->i_ino, PTR_ERR(tfm));
+ return tfm;
+ }
+ if (unlikely(!mode->logged_impl_name)) {
+ /*
+ * fscrypt performance can vary greatly depending on which
+ * crypto algorithm implementation is used. Help people debug
+ * performance problems by logging the ->cra_driver_name the
+ * first time a mode is used. Note that multiple threads can
+ * race here, but it doesn't really matter.
+ */
+ mode->logged_impl_name = true;
+ pr_info("fscrypt: %s using implementation \"%s\"\n",
+ mode->friendly_name,
+ crypto_skcipher_alg(tfm)->base.cra_driver_name);
+ }
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+ if (err)
+ goto err_free_tfm;
+
+ return tfm;
+
+err_free_tfm:
+ crypto_free_skcipher(tfm);
+ return ERR_PTR(err);
+}
+
+/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */
+struct fscrypt_master_key {
+ struct hlist_node mk_node;
+ refcount_t mk_refcount;
+ const struct fscrypt_mode *mk_mode;
+ struct crypto_skcipher *mk_ctfm;
+ u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 mk_raw[FS_MAX_KEY_SIZE];
+};
+
+static void free_master_key(struct fscrypt_master_key *mk)
+{
+ if (mk) {
+ crypto_free_skcipher(mk->mk_ctfm);
+ kzfree(mk);
+ }
+}
+
+static void put_master_key(struct fscrypt_master_key *mk)
+{
+ if (!refcount_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock))
return;
+ hash_del(&mk->mk_node);
+ spin_unlock(&fscrypt_master_keys_lock);
- crypto_free_skcipher(ci->ci_ctfm);
- crypto_free_cipher(ci->ci_essiv_tfm);
- kmem_cache_free(fscrypt_info_cachep, ci);
+ free_master_key(mk);
+}
+
+/*
+ * Find/insert the given master key into the fscrypt_master_keys table. If
+ * found, it is returned with elevated refcount, and 'to_insert' is freed if
+ * non-NULL. If not found, 'to_insert' is inserted and returned if it's
+ * non-NULL; otherwise NULL is returned.
+ */
+static struct fscrypt_master_key *
+find_or_insert_master_key(struct fscrypt_master_key *to_insert,
+ const u8 *raw_key, const struct fscrypt_mode *mode,
+ const struct fscrypt_info *ci)
+{
+ unsigned long hash_key;
+ struct fscrypt_master_key *mk;
+
+ /*
+ * Careful: to avoid potentially leaking secret key bytes via timing
+ * information, we must key the hash table by descriptor rather than by
+ * raw key, and use crypto_memneq() when comparing raw keys.
+ */
+
+ BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE);
+ memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
+
+ spin_lock(&fscrypt_master_keys_lock);
+ hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
+ if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) != 0)
+ continue;
+ if (mode != mk->mk_mode)
+ continue;
+ if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize))
+ continue;
+ /* using existing tfm with same (descriptor, mode, raw_key) */
+ refcount_inc(&mk->mk_refcount);
+ spin_unlock(&fscrypt_master_keys_lock);
+ free_master_key(to_insert);
+ return mk;
+ }
+ if (to_insert)
+ hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key);
+ spin_unlock(&fscrypt_master_keys_lock);
+ return to_insert;
+}
+
+/* Prepare to encrypt directly using the master key in the given mode */
+static struct fscrypt_master_key *
+fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
+ const u8 *raw_key, const struct inode *inode)
+{
+ struct fscrypt_master_key *mk;
+ int err;
+
+ /* Is there already a tfm for this key? */
+ mk = find_or_insert_master_key(NULL, raw_key, mode, ci);
+ if (mk)
+ return mk;
+
+ /* Nope, allocate one. */
+ mk = kzalloc(sizeof(*mk), GFP_NOFS);
+ if (!mk)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&mk->mk_refcount, 1);
+ mk->mk_mode = mode;
+ mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+ if (IS_ERR(mk->mk_ctfm)) {
+ err = PTR_ERR(mk->mk_ctfm);
+ mk->mk_ctfm = NULL;
+ goto err_free_mk;
+ }
+ memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+ memcpy(mk->mk_raw, raw_key, mode->keysize);
+
+ return find_or_insert_master_key(mk, raw_key, mode, ci);
+
+err_free_mk:
+ free_master_key(mk);
+ return ERR_PTR(err);
}
static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
@@ -275,11 +444,67 @@ void __exit fscrypt_essiv_cleanup(void)
crypto_free_shash(essiv_hash_tfm);
}
+/*
+ * Given the encryption mode and key (normally the derived key, but for
+ * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's
+ * symmetric cipher transform object(s).
+ */
+static int setup_crypto_transform(struct fscrypt_info *ci,
+ struct fscrypt_mode *mode,
+ const u8 *raw_key, const struct inode *inode)
+{
+ struct fscrypt_master_key *mk;
+ struct crypto_skcipher *ctfm;
+ int err;
+
+ if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) {
+ mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
+ if (IS_ERR(mk))
+ return PTR_ERR(mk);
+ ctfm = mk->mk_ctfm;
+ } else {
+ mk = NULL;
+ ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+ if (IS_ERR(ctfm))
+ return PTR_ERR(ctfm);
+ }
+ ci->ci_master_key = mk;
+ ci->ci_ctfm = ctfm;
+
+ if (mode->needs_essiv) {
+ /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+ WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
+ WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY);
+
+ err = init_essiv_generator(ci, raw_key, mode->keysize);
+ if (err) {
+ fscrypt_warn(inode->i_sb,
+ "error initializing ESSIV generator for inode %lu: %d",
+ inode->i_ino, err);
+ return err;
+ }
+ }
+ return 0;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ if (ci->ci_master_key) {
+ put_master_key(ci->ci_master_key);
+ } else {
+ crypto_free_skcipher(ci->ci_ctfm);
+ crypto_free_cipher(ci->ci_essiv_tfm);
+ }
+ kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
int fscrypt_get_encryption_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
struct fscrypt_context ctx;
- struct crypto_skcipher *ctfm;
struct fscrypt_mode *mode;
u8 *raw_key = NULL;
int res;
@@ -312,74 +537,42 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
return -EINVAL;
- crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
if (!crypt_info)
return -ENOMEM;
crypt_info->ci_flags = ctx.flags;
crypt_info->ci_data_mode = ctx.contents_encryption_mode;
crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
- crypt_info->ci_ctfm = NULL;
- crypt_info->ci_essiv_tfm = NULL;
- memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
- sizeof(crypt_info->ci_master_key));
+ memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+ memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
mode = select_encryption_mode(crypt_info, inode);
if (IS_ERR(mode)) {
res = PTR_ERR(mode);
goto out;
}
+ WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
+ crypt_info->ci_mode = mode;
/*
- * This cannot be a stack buffer because it is passed to the scatterlist
- * crypto API as part of key derivation.
+ * This cannot be a stack buffer because it may be passed to the
+ * scatterlist crypto API as part of key derivation.
*/
res = -ENOMEM;
raw_key = kmalloc(mode->keysize, GFP_NOFS);
if (!raw_key)
goto out;
- res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize);
+ res = find_and_derive_key(inode, &ctx, raw_key, mode);
if (res)
goto out;
- ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
- if (IS_ERR(ctfm)) {
- res = PTR_ERR(ctfm);
- fscrypt_warn(inode->i_sb,
- "error allocating '%s' transform for inode %lu: %d",
- mode->cipher_str, inode->i_ino, res);
- goto out;
- }
- if (unlikely(!mode->logged_impl_name)) {
- /*
- * fscrypt performance can vary greatly depending on which
- * crypto algorithm implementation is used. Help people debug
- * performance problems by logging the ->cra_driver_name the
- * first time a mode is used. Note that multiple threads can
- * race here, but it doesn't really matter.
- */
- mode->logged_impl_name = true;
- pr_info("fscrypt: %s using implementation \"%s\"\n",
- mode->friendly_name,
- crypto_skcipher_alg(ctfm)->base.cra_driver_name);
- }
- crypt_info->ci_ctfm = ctfm;
- crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize);
+ res = setup_crypto_transform(crypt_info, mode, raw_key, inode);
if (res)
goto out;
- if (S_ISREG(inode->i_mode) &&
- crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
- res = init_essiv_generator(crypt_info, raw_key, mode->keysize);
- if (res) {
- fscrypt_warn(inode->i_sb,
- "error initializing ESSIV generator for inode %lu: %d",
- inode->i_ino, res);
- goto out;
- }
- }
if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
crypt_info = NULL;
out:
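find_or_insert_master_key() above deliberately hashes by key descriptor and compares raw keys with crypto_memneq(), because memcmp()'s early exit would leak how many leading key bytes matched. A standalone sketch of such a constant-time comparison (the kernel's crypto_memneq() is more careful still, but the idea is the same):

#include <stddef.h>
#include <stdio.h>

static int ct_memneq(const void *a, const void *b, size_t n)
{
	const unsigned char *pa = a, *pb = b;
	unsigned char diff = 0;
	size_t i;

	for (i = 0; i < n; i++)		/* no data-dependent early exit */
		diff |= pa[i] ^ pb[i];
	return diff != 0;
}

int main(void)
{
	unsigned char k1[32] = { 1, 2, 3 };
	unsigned char k2[32] = { 1, 2, 4 };

	printf("keys %s\n", ct_memneq(k1, k2, 32) ? "differ" : "match");
	return 0;
}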
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index c6d431a5cce9..f490de921ce8 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -199,7 +199,8 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
child_ci = child->i_crypt_info;
if (parent_ci && child_ci) {
- return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+ return memcmp(parent_ci->ci_master_key_descriptor,
+ child_ci->ci_master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE) == 0 &&
(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
(parent_ci->ci_filename_mode ==
@@ -254,7 +255,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
ctx.contents_encryption_mode = ci->ci_data_mode;
ctx.filenames_encryption_mode = ci->ci_filename_mode;
ctx.flags = ci->ci_flags;
- memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
diff --git a/fs/dax.c b/fs/dax.c
index 0fb270f0a0ef..6959837cc465 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -38,6 +38,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+ if (pe_size == PE_SIZE_PTE)
+ return PAGE_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PUD)
+ return PUD_SHIFT - PAGE_SHIFT;
+ return ~0;
+}
+
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -46,6 +57,9 @@
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
+/* The order of a PMD entry */
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -59,63 +73,68 @@ static int __init init_dax_wait_table(void)
fs_initcall(init_dax_wait_table);
/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking. In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking. In total four special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT (4)
+#define DAX_LOCKED (1UL << 0)
+#define DAX_PMD (1UL << 1)
+#define DAX_ZERO_PAGE (1UL << 2)
+#define DAX_EMPTY (1UL << 3)
-static unsigned long dax_radix_pfn(void *entry)
+static unsigned long dax_to_pfn(void *entry)
{
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
+ return xa_to_value(entry) >> DAX_SHIFT;
}
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+ return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
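
To make the encoding concrete, a hypothetical round trip through these helpers (the pfn value is illustrative):

	void *entry = dax_make_entry(pfn_to_pfn_t(0x1000), DAX_PMD);

	/* The pfn lives above DAX_SHIFT; the four flag bits live below it */
	WARN_ON(dax_to_pfn(entry) != 0x1000);
	WARN_ON(!dax_is_pmd_entry(entry));
	WARN_ON(dax_is_locked(entry));	/* DAX_LOCKED is set separately */
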
-static unsigned int dax_radix_order(void *entry)
+static bool dax_is_locked(void *entry)
{
- if ((unsigned long)entry & RADIX_DAX_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
+ return xa_to_value(entry) & DAX_LOCKED;
+}
+
+static unsigned int dax_entry_order(void *entry)
+{
+ if (xa_to_value(entry) & DAX_PMD)
+ return PMD_ORDER;
return 0;
}
-static int dax_is_pmd_entry(void *entry)
+static unsigned long dax_is_pmd_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_PMD;
+ return xa_to_value(entry) & DAX_PMD;
}
-static int dax_is_pte_entry(void *entry)
+static bool dax_is_pte_entry(void *entry)
{
- return !((unsigned long)entry & RADIX_DAX_PMD);
+ return !(xa_to_value(entry) & DAX_PMD);
}
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_EMPTY;
+ return xa_to_value(entry) & DAX_EMPTY;
}
/*
- * DAX radix tree locking
+ * DAX page cache entry locking
*/
struct exceptional_entry_key {
- struct address_space *mapping;
+ struct xarray *xa;
pgoff_t entry_start;
};
@@ -124,10 +143,11 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
- pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+ void *entry, struct exceptional_entry_key *key)
{
unsigned long hash;
+ unsigned long index = xas->xa_index;
/*
* If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -136,22 +156,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
*/
if (dax_is_pmd_entry(entry))
index &= ~PG_PMD_COLOUR;
-
- key->mapping = mapping;
+ key->xa = xas->xa;
key->entry_start = index;
- hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+ hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
- int sync, void *keyp)
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
struct wait_exceptional_entry_queue *ewait =
container_of(wait, struct wait_exceptional_entry_queue, wait);
- if (key->mapping != ewait->key.mapping ||
+ if (key->xa != ewait->key.xa ||
key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
@@ -162,13 +181,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
*/
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
+ wq = dax_entry_waitqueue(xas, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -181,55 +199,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
- * Check whether the given slot is locked. Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
- return entry & RADIX_DAX_ENTRY_LOCK;
-}
-
-/*
- * Mark the given slot as locked. Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Mark the given slot as unlocked. Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it. The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did.
*
* Must be called with the i_pages lock held.
*/
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp, bool (*wait_fn)(void))
+static void *get_unlocked_entry(struct xa_state *xas)
{
- void *entry, **slot;
+ void *entry;
struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
@@ -237,80 +216,80 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
- bool revalidate;
-
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
- &slot);
- if (!entry ||
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
- !slot_locked(mapping, slot)) {
- if (slotp)
- *slotp = slot;
+ entry = xas_find_conflict(xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
+ !dax_is_locked(entry))
return entry;
- }
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- xa_unlock_irq(&mapping->i_pages);
- revalidate = wait_fn();
+ xas_unlock_irq(xas);
+ xas_reset(xas);
+ schedule();
finish_wait(wq, &ewait.wait);
- xa_lock_irq(&mapping->i_pages);
- if (revalidate)
- return ERR_PTR(-EAGAIN);
+ xas_lock_irq(xas);
}
}
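
The usual shape of a caller, as seen throughout the rest of this patch (sketch only; the predicate is hypothetical):

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas);
	if (entry_is_what_we_want(entry))	/* hypothetical check */
		dax_lock_entry(&xas, entry);	/* take ownership of the entry */
	else
		put_unlocked_entry(&xas, entry);/* wake the next waiter */
	xas_unlock_irq(&xas);
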
-static bool entry_wait(void)
+/*
+ * The only thing keeping the address space around is the i_pages lock
+ * (it's cycled in clear_inode() after removing the entries from i_pages)
+ * (it's cycled in clear_inode() after removing the entries from i_pages).
+ * After we call xas_unlock_irq(), we cannot touch xas->xa.
+ */
+static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
- schedule();
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq;
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
/*
- * Never return an ERR_PTR() from
- * __get_unlocked_mapping_entry(), just keep looping.
+ * Unlike get_unlocked_entry() there is no guarantee that this
+ * path ever successfully retrieves an unlocked entry before an
+ * inode dies. Perform a non-exclusive wait in case this path
+ * never successfully performs its own wake up.
*/
- return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp)
-{
- return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
+ prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
+ xas_unlock_irq(xas);
+ schedule();
+ finish_wait(wq, &ewait.wait);
}
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
- void *entry, **slot;
-
- xa_lock_irq(&mapping->i_pages);
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
- !slot_locked(mapping, slot))) {
- xa_unlock_irq(&mapping->i_pages);
- return;
- }
- unlock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ /* If we were the only waiter woken, wake the next one */
+ if (entry)
+ dax_wake_entry(xas, entry, false);
}
-static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index)
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ */
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
- unlock_mapping_entry(mapping, index);
+ void *old;
+
+ BUG_ON(dax_is_locked(entry));
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ old = xas_store(xas, entry);
+ xas_unlock_irq(xas);
+ BUG_ON(!dax_is_locked(old));
+ dax_wake_entry(xas, entry, false);
}
/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ * Return: The entry stored at this location before it was locked.
*/
-static void put_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
- if (!entry)
- return;
-
- /* We have to wake up next waiter for the radix tree entry lock */
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ unsigned long v = xa_to_value(entry);
+ return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
static unsigned long dax_entry_size(void *entry)
@@ -325,9 +304,9 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_radix_end_pfn(void *entry)
+static unsigned long dax_end_pfn(void *entry)
{
- return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}
/*
@@ -335,8 +314,8 @@ static unsigned long dax_radix_end_pfn(void *entry)
* 'empty' and 'zero' entries.
*/
#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_radix_pfn(entry); \
- pfn < dax_radix_end_pfn(entry); pfn++)
+ for (pfn = dax_to_pfn(entry); \
+ pfn < dax_end_pfn(entry); pfn++)
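
A usage sketch, modelled on dax_busy_page() referenced below (the refcount test is an assumption about that helper, not a quote of it):

	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_ref_count(page) > 1)	/* extra reference: page is busy */
			return page;
	}
	return NULL;
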
/*
* TODO: for reflink+dax we need a way to associate a single page with
@@ -393,32 +372,26 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-static bool entry_wait_revalidate(void)
-{
- rcu_read_unlock();
- schedule();
- rcu_read_lock();
-
- /*
- * Tell __get_unlocked_mapping_entry() to take a break, we need
- * to revalidate page->mapping after dropping locks
- */
- return true;
-}
-
-bool dax_lock_mapping_entry(struct page *page)
+/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * @page: The page whose entry we want to lock
+ *
+ * Context: Process context.
+ * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * not be locked.
+ */
+dax_entry_t dax_lock_page(struct page *page)
{
- pgoff_t index;
- struct inode *inode;
- bool did_lock = false;
- void *entry = NULL, **slot;
- struct address_space *mapping;
+ XA_STATE(xas, NULL, 0);
+ void *entry;
+ /* Ensure page->mapping isn't freed while we look at it */
rcu_read_lock();
for (;;) {
- mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(page->mapping);
- if (!dax_mapping(mapping))
+ entry = NULL;
+ if (!mapping || !dax_mapping(mapping))
break;
/*
@@ -428,98 +401,93 @@ bool dax_lock_mapping_entry(struct page *page)
* otherwise we would not have a valid pfn_to_page()
* translation.
*/
- inode = mapping->host;
- if (S_ISCHR(inode->i_mode)) {
- did_lock = true;
+ entry = (void *)~0UL;
+ if (S_ISCHR(mapping->host->i_mode))
break;
- }
- xa_lock_irq(&mapping->i_pages);
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
if (mapping != page->mapping) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
continue;
}
- index = page->index;
-
- entry = __get_unlocked_mapping_entry(mapping, index, &slot,
- entry_wait_revalidate);
- if (!entry) {
- xa_unlock_irq(&mapping->i_pages);
- break;
- } else if (IS_ERR(entry)) {
- xa_unlock_irq(&mapping->i_pages);
- WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+ xas_set(&xas, page->index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ rcu_read_unlock();
+ wait_entry_unlocked(&xas, entry);
+ rcu_read_lock();
continue;
}
- lock_slot(mapping, slot);
- did_lock = true;
- xa_unlock_irq(&mapping->i_pages);
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
break;
}
rcu_read_unlock();
-
- return did_lock;
+ return (dax_entry_t)entry;
}
-void dax_unlock_mapping_entry(struct page *page)
+void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
+ XA_STATE(xas, &mapping->i_pages, page->index);
- if (S_ISCHR(inode->i_mode))
+ if (S_ISCHR(mapping->host->i_mode))
return;
- unlock_mapping_entry(mapping, page->index);
+ dax_unlock_entry(&xas, (void *)cookie);
}
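
A caller-side sketch of the new cookie API, following the kernel-doc above (the error code chosen here is illustrative):

	dax_entry_t cookie = dax_lock_page(page);

	if (!cookie)
		return -EBUSY;	/* entry could not be locked */
	/* ... inspect the page while its DAX entry is held ... */
	dax_unlock_page(page, cookie);
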
/*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
*
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
*
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries. This happens for both PMD zero pages as
+ * well as PMD empty entries.
*
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them. We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them. We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
*
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
+ *
+ * On error, this function does not return an ERR_PTR. Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
+ * overlap with xarray value entries.
*/
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
- unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+ struct address_space *mapping, unsigned long size_flag)
{
- bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
- void *entry, **slot;
+ unsigned long index = xas->xa_index;
+ bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ void *entry;
-restart:
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
-
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
- entry = ERR_PTR(-EIO);
- goto out_unlock;
- }
+retry:
+ xas_lock_irq(xas);
+ entry = get_unlocked_entry(xas);
if (entry) {
- if (size_flag & RADIX_DAX_PMD) {
+ if (!xa_is_value(entry)) {
+ xas_set_err(xas, EIO);
+ goto out_unlock;
+ }
+
+ if (size_flag & DAX_PMD) {
if (dax_is_pte_entry(entry)) {
- put_unlocked_mapping_entry(mapping, index,
- entry);
- entry = ERR_PTR(-EEXIST);
- goto out_unlock;
+ put_unlocked_entry(xas, entry);
+ goto fallback;
}
} else { /* trying to grab a PTE entry */
if (dax_is_pmd_entry(entry) &&
@@ -530,87 +498,57 @@ restart:
}
}
- /* No entry for given index? Make sure radix tree is big enough. */
- if (!entry || pmd_downgrade) {
- int err;
-
- if (pmd_downgrade) {
- /*
- * Make sure 'entry' remains valid while we drop
- * the i_pages lock.
- */
- entry = lock_slot(mapping, slot);
- }
+ if (pmd_downgrade) {
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * the i_pages lock.
+ */
+ dax_lock_entry(xas, entry);
- xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages, the only other things that get
* downgraded are empty entries, which don't need to be
* unmapped.
*/
- if (pmd_downgrade && dax_is_zero_entry(entry))
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
-
- err = radix_tree_preload(
- mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
- if (err) {
- if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index);
- return ERR_PTR(err);
- }
- xa_lock_irq(&mapping->i_pages);
-
- if (!entry) {
- /*
- * We needed to drop the i_pages lock while calling
- * radix_tree_preload() and we didn't have an entry to
- * lock. See if another thread inserted an entry at
- * our index during this time.
- */
- entry = __radix_tree_lookup(&mapping->i_pages, index,
- NULL, &slot);
- if (entry) {
- radix_tree_preload_end();
- xa_unlock_irq(&mapping->i_pages);
- goto restart;
- }
+ if (dax_is_zero_entry(entry)) {
+ xas_unlock_irq(xas);
+ unmap_mapping_pages(mapping,
+ xas->xa_index & ~PG_PMD_COLOUR,
+ PG_PMD_NR, false);
+ xas_reset(xas);
+ xas_lock_irq(xas);
}
- if (pmd_downgrade) {
- dax_disassociate_entry(entry, mapping, false);
- radix_tree_delete(&mapping->i_pages, index);
- mapping->nrexceptional--;
- dax_wake_mapping_entry_waiter(mapping, index, entry,
- true);
- }
+ dax_disassociate_entry(entry, mapping, false);
+ xas_store(xas, NULL); /* undo the PMD join */
+ dax_wake_entry(xas, entry, true);
+ mapping->nrexceptional--;
+ entry = NULL;
+ xas_set(xas, index);
+ }
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
-
- err = __radix_tree_insert(&mapping->i_pages, index,
- dax_radix_order(entry), entry);
- radix_tree_preload_end();
- if (err) {
- xa_unlock_irq(&mapping->i_pages);
- /*
- * Our insertion of a DAX entry failed, most likely
- * because we were inserting a PMD entry and it
- * collided with a PTE sized entry at a different
- * index in the PMD range. We haven't inserted
- * anything into the radix tree and have no waiters to
- * wake.
- */
- return ERR_PTR(err);
- }
- /* Good, we have inserted empty locked entry into the tree. */
+ if (entry) {
+ dax_lock_entry(xas, entry);
+ } else {
+ entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+ dax_lock_entry(xas, entry);
+ if (xas_error(xas))
+ goto out_unlock;
mapping->nrexceptional++;
- xa_unlock_irq(&mapping->i_pages);
- return entry;
}
- entry = lock_slot(mapping, slot);
- out_unlock:
- xa_unlock_irq(&mapping->i_pages);
+
+out_unlock:
+ xas_unlock_irq(xas);
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+ goto retry;
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
+ return xa_mk_internal(VM_FAULT_OOM);
+ if (xas_error(xas))
+ return xa_mk_internal(VM_FAULT_SIGBUS);
return entry;
+fallback:
+ xas_unlock_irq(xas);
+ return xa_mk_internal(VM_FAULT_FALLBACK);
}
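
Callers decode the new return convention with xa_is_internal(), exactly as the PTE fault path later in this patch does:

	entry = grab_mapping_entry(&xas, mapping, 0);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);	/* VM_FAULT_OOM/_SIGBUS/_FALLBACK */
		goto out;
	}
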
/**
@@ -630,11 +568,10 @@ restart:
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
- pgoff_t indices[PAGEVEC_SIZE];
+ XA_STATE(xas, &mapping->i_pages, 0);
+ void *entry;
+ unsigned int scanned = 0;
struct page *page = NULL;
- struct pagevec pvec;
- pgoff_t index, end;
- unsigned i;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
@@ -645,13 +582,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
- pagevec_init(&pvec);
- index = 0;
- end = -1;
-
/*
* If we race get_user_pages_fast() here either we'll see the
- * elevated page count in the pagevec_lookup and wait, or
+ * elevated page count in the iteration and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
@@ -663,94 +596,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
*/
unmap_mapping_range(mapping, 0, 0, 1);
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- pgoff_t nr_pages = 1;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *pvec_ent = pvec.pages[i];
- void *entry;
-
- index = indices[i];
- if (index >= end)
- break;
-
- if (WARN_ON_ONCE(
- !radix_tree_exceptional_entry(pvec_ent)))
- continue;
-
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (entry) {
- page = dax_busy_page(entry);
- /*
- * Account for multi-order entries at
- * the end of the pagevec.
- */
- if (i + 1 >= pagevec_count(&pvec))
- nr_pages = 1UL << dax_radix_order(entry);
- }
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
- if (page)
- break;
- }
-
- /*
- * We don't expect normal struct page entries to exist in our
- * tree, but we keep these pagevec calls so that this code is
- * consistent with the common pattern for handling pagevecs
- * throughout the kernel.
- */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- index += nr_pages;
-
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
+ continue;
+ if (unlikely(dax_is_locked(entry)))
+ entry = get_unlocked_entry(&xas);
+ if (entry)
+ page = dax_busy_page(entry);
+ put_unlocked_entry(&xas, entry);
if (page)
break;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
+ xas_unlock_irq(&xas);
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
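
The scan loop above bounds lock hold times with an idiom that recurs in dax_writeback_mapping_range() below; the core of the pattern, assuming the xa_state is held under xas_lock_irq() and `scanned` counts iterations:

	if (++scanned % XA_CHECK_SCHED)
		continue;			/* keep scanning, lock still held */

	xas_pause(&xas);			/* record position before unlocking */
	xas_unlock_irq(&xas);
	cond_resched();				/* let others run */
	xas_lock_irq(&xas);			/* resume from the paused position */
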
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
+ XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
- struct radix_tree_root *pages = &mapping->i_pages;
- xa_lock_irq(pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
- (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
+ (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
dax_disassociate_entry(entry, mapping, trunc);
- radix_tree_delete(pages, index);
+ xas_store(&xas, NULL);
mapping->nrexceptional--;
ret = 1;
out:
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(pages);
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
return ret;
}
+
/*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping. Wait for it
+ * to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
- int ret = __dax_invalidate_mapping_entry(mapping, index, true);
+ int ret = __dax_invalidate_entry(mapping, index, true);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
- * radix tree (usually fs-private i_mmap_sem for writing). Since the
- * caller has seen exceptional entry for this index, we better find it
+ * page cache (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen a DAX entry for this index, we better find it
* at that index as well...
*/
WARN_ON_ONCE(!ret);
@@ -758,12 +665,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
- return __dax_invalidate_mapping_entry(mapping, index, false);
+ return __dax_invalidate_entry(mapping, index, false);
}
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
@@ -799,30 +706,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_mapping_entry(struct address_space *mapping,
- struct vm_fault *vmf,
- void *entry, pfn_t pfn_t,
- unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+ struct address_space *mapping, struct vm_fault *vmf,
+ void *entry, pfn_t pfn, unsigned long flags, bool dirty)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- unsigned long pfn = pfn_t_to_pfn(pfn_t);
- pgoff_t index = vmf->pgoff;
- void *new_entry;
+ void *new_entry = dax_make_entry(pfn, flags);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
+ PG_PMD_NR, false);
else /* pte entry */
- unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+ unmap_mapping_pages(mapping, index, 1, false);
}
- xa_lock_irq(pages);
- new_entry = dax_radix_locked_entry(pfn, flags);
+ xas_reset(xas);
+ xas_lock_irq(xas);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -830,33 +734,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
- * Only swap our new entry into the radix tree if the current
+ * Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
- * PMD entry is already in the tree, we leave it alone. This
+ * PMD entry is already in the cache, we leave it alone. This
* means that if we are trying to insert a PTE and the
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
- struct radix_tree_node *node;
- void **slot;
- void *ret;
-
- ret = __radix_tree_lookup(pages, index, &node, &slot);
- WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(pages, node, slot,
- new_entry, NULL);
+ void *old = dax_lock_entry(xas, new_entry);
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+ DAX_LOCKED));
entry = new_entry;
+ } else {
+ xas_load(xas); /* Walk the xa_state */
}
if (dirty)
- radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
+ xas_unlock_irq(xas);
return entry;
}
-static inline unsigned long
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+static inline
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
unsigned long address;
@@ -866,8 +767,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
}
/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
- pgoff_t index, unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
+ unsigned long pfn)
{
struct vm_area_struct *vma;
pte_t pte, *ptep = NULL;
@@ -876,7 +777,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- unsigned long address, start, end;
+ struct mmu_notifier_range range;
+ unsigned long address;
cond_resched();
@@ -890,7 +792,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
* call mmu_notifier_invalidate_range_start() on our behalf
* before taking any lock.
*/
- if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
+ if (follow_pte_pmd(vma->vm_mm, address, &range,
+ &ptep, &pmdp, &ptl))
continue;
/*
@@ -932,16 +835,14 @@ unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);
}
i_mmap_unlock_read(mapping);
}
-static int dax_writeback_one(struct dax_device *dax_dev,
- struct address_space *mapping, pgoff_t index, void *entry)
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+ struct address_space *mapping, void *entry)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- void *entry2, **slot;
unsigned long pfn;
long ret = 0;
size_t size;
@@ -950,32 +851,38 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* A page got tagged dirty in DAX mapping? Something is seriously
* wrong.
*/
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ if (WARN_ON(!xa_is_value(entry)))
return -EIO;
- xa_lock_irq(pages);
- entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
- /* Entry got punched out / reallocated? */
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
- goto put_unlocked;
- /*
- * Entry got reallocated elsewhere? No need to writeback. We have to
- * compare pfns as we must not bail out due to difference in lockbit
- * or entry type.
- */
- if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
- goto put_unlocked;
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
- dax_is_zero_entry(entry))) {
- ret = -EIO;
- goto put_unlocked;
+ if (unlikely(dax_is_locked(entry))) {
+ void *old_entry = entry;
+
+ entry = get_unlocked_entry(xas);
+
+ /* Entry got punched out / reallocated? */
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback.
+ * We have to compare pfns as we must not bail out due to
+ * difference in lockbit or entry type.
+ */
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
+ ret = -EIO;
+ goto put_unlocked;
+ }
+
+ /* Another fsync thread may have already done this entry */
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
}
- /* Another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
- goto put_unlocked;
/* Lock the entry to serialize with page faults */
- entry = lock_slot(mapping, slot);
+ dax_lock_entry(xas, entry);
+
/*
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
@@ -983,8 +890,8 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* at the entry only under the i_pages lock and once they do that
* they will see the entry locked and wait for it to unlock.
*/
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
- xa_unlock_irq(pages);
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irq(xas);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
@@ -993,10 +900,10 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
- pfn = dax_radix_pfn(entry);
- size = PAGE_SIZE << dax_radix_order(entry);
+ pfn = dax_to_pfn(entry);
+ size = PAGE_SIZE << dax_entry_order(entry);
- dax_mapping_entry_mkclean(mapping, index, pfn);
+ dax_entry_mkclean(mapping, xas->xa_index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1004,16 +911,18 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
- xa_lock_irq(pages);
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
- trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- put_locked_mapping_entry(mapping, index);
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ xas_store(xas, entry);
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+ dax_wake_entry(xas, entry, false);
+
+ trace_dax_writeback_one(mapping->host, xas->xa_index,
+ size >> PAGE_SHIFT);
return ret;
put_unlocked:
- put_unlocked_mapping_entry(mapping, index, entry2);
- xa_unlock_irq(pages);
+ put_unlocked_entry(xas, entry);
return ret;
}
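
Condensing the comments above, the ordering dax_writeback_one() relies on is, in sketch form:

	/*
	 * 1. Lock the entry and clear PAGECACHE_TAG_TOWRITE under the
	 *    i_pages lock, so concurrent fsync threads skip this entry.
	 * 2. Drop the lock; write-protect userspace mappings and
	 *    dax_flush() the whole entry (PAGE_SIZE << dax_entry_order()).
	 * 3. Retake the lock; clear PAGECACHE_TAG_DIRTY, store the
	 *    unlocked entry and wake waiters. New faults see the locked
	 *    entry in the meantime and wait.
	 */
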
@@ -1025,13 +934,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
+ XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
struct inode *inode = mapping->host;
- pgoff_t start_index, end_index;
- pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
struct dax_device *dax_dev;
- struct pagevec pvec;
- bool done = false;
- int i, ret = 0;
+ void *entry;
+ int ret = 0;
+ unsigned int scanned = 0;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
@@ -1043,41 +952,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!dax_dev)
return -EIO;
- start_index = wbc->range_start >> PAGE_SHIFT;
- end_index = wbc->range_end >> PAGE_SHIFT;
-
- trace_dax_writeback_range(inode, start_index, end_index);
-
- tag_pages_for_writeback(mapping, start_index, end_index);
+ trace_dax_writeback_range(inode, xas.xa_index, end_index);
- pagevec_init(&pvec);
- while (!done) {
- pvec.nr = find_get_entries_tag(mapping, start_index,
- PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
- pvec.pages, indices);
+ tag_pages_for_writeback(mapping, xas.xa_index, end_index);
- if (pvec.nr == 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
+ ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
+ if (ret < 0) {
+ mapping_set_error(mapping, ret);
break;
-
- for (i = 0; i < pvec.nr; i++) {
- if (indices[i] > end_index) {
- done = true;
- break;
- }
-
- ret = dax_writeback_one(dax_dev, mapping, indices[i],
- pvec.pages[i]);
- if (ret < 0) {
- mapping_set_error(mapping, ret);
- goto out;
- }
}
- start_index = indices[pvec.nr - 1] + 1;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
-out:
+ xas_unlock_irq(&xas);
put_dax(dax_dev);
- trace_dax_writeback_range_done(inode, start_index, end_index);
- return (ret < 0 ? ret : 0);
+ trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
+ return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -1125,16 +1022,18 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+ struct address_space *mapping, void **entry,
+ struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
- false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_ZERO_PAGE, false);
+
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -1342,6 +1241,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1368,9 +1268,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- ret = dax_fault_return(PTR_ERR(entry));
+ entry = grab_mapping_entry(&xas, mapping, 0);
+ if (xa_is_internal(entry)) {
+ ret = xa_to_internal(entry);
goto out;
}
@@ -1443,7 +1343,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto error_finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
0, write && !sync);
/*
@@ -1471,7 +1371,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!write) {
- ret = dax_load_hole(mapping, entry, vmf);
+ ret = dax_load_hole(&xas, mapping, &entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1498,21 +1398,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff);
+ dax_unlock_entry(&xas, entry);
out:
trace_dax_pte_fault_done(inode, vmf, ret);
return ret | major;
}
#ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ struct iomap *iomap, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct inode *inode = mapping->host;
struct page *zero_page;
- void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
pfn_t pfn;
@@ -1523,8 +1422,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1536,11 +1435,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
return VM_FAULT_NOPAGE;
fallback:
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
return VM_FAULT_FALLBACK;
}
@@ -1549,6 +1448,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
@@ -1556,7 +1456,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
- pgoff_t max_pgoff, pgoff;
+ pgoff_t max_pgoff;
void *entry;
loff_t pos;
int error;
@@ -1567,7 +1467,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
- pgoff = linear_page_index(vma, pmd_addr);
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1576,7 +1475,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* Make sure that the faulting address's PMD offset (color) matches
* the PMD offset from the start of the file. This is necessary so
* that a PMD range in the page table overlaps exactly with a PMD
- * range in the radix tree.
+ * range in the page cache.
*/
if ((vmf->pgoff & PG_PMD_COLOUR) !=
((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
@@ -1592,24 +1491,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- if (pgoff >= max_pgoff) {
+ if (xas.xa_index >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
- if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+ if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
- * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
- * is already in the tree, for instance), it will return -EEXIST and
- * we just fall back to 4k entries.
+ * grab_mapping_entry() will make sure we get an empty PMD entry,
+ * a zero PMD entry or a DAX PMD. If it can't (because a PTE
+ * entry is already in the array, for instance), it will return
+ * VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
+ entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+ if (xa_is_internal(entry)) {
+ result = xa_to_internal(entry);
goto fallback;
+ }
/*
* It is possible, particularly with mixed reads & writes to private
@@ -1628,7 +1529,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
*/
- pos = (loff_t)pgoff << PAGE_SHIFT;
+ pos = (loff_t)xas.xa_index << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
goto unlock_entry;
@@ -1644,8 +1545,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD, write && !sync);
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
+ DAX_PMD, write && !sync);
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1669,7 +1570,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, entry);
+ result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1692,7 +1593,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff);
+ dax_unlock_entry(&xas, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1737,54 +1638,48 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
-/**
+/*
* dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
* @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
+ * @order: Order of entry to insert.
*
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
- * DAX file. It takes care of marking corresponding radix tree entry as dirty
- * as well.
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
- enum page_entry_size pe_size,
- pfn_t pfn)
+static vm_fault_t
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ void *entry;
vm_fault_t ret;
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
/* Did we race with someone splitting entry or so? */
if (!entry ||
- (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
- (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
+ (order == 0 && !dax_is_pte_entry(entry)) ||
+ (order == PMD_ORDER && !dax_is_pmd_entry(entry))) {
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
- radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- switch (pe_size) {
- case PE_SIZE_PTE:
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- break;
#ifdef CONFIG_FS_DAX_PMD
- case PE_SIZE_PMD:
+ else if (order == PMD_ORDER)
ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
- break;
#endif
- default:
+ else
ret = VM_FAULT_FALLBACK;
- }
- put_locked_mapping_entry(mapping, index);
+ dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
}
@@ -1804,17 +1699,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
- size_t len = 0;
+ unsigned int order = pe_order(pe_size);
+ size_t len = PAGE_SIZE << order;
- if (pe_size == PE_SIZE_PTE)
- len = PAGE_SIZE;
- else if (pe_size == PE_SIZE_PMD)
- len = PMD_SIZE;
- else
- WARN_ON_ONCE(1);
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
if (err)
return VM_FAULT_SIGBUS;
- return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+ return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
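
With pe_order() from the top of this patch, the length arithmetic collapses to a shift; worked out for the two common cases:

	/*
	 * PE_SIZE_PTE: order = 0         -> len = PAGE_SIZE << 0 == PAGE_SIZE
	 * PE_SIZE_PMD: order = PMD_ORDER -> len = PAGE_SIZE << PMD_ORDER == PMD_SIZE
	 */
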
diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d85e9b4..aac41adf4743 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -26,7 +26,7 @@
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
@@ -119,6 +119,7 @@ struct dentry_stat_t dentry_stat = {
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
+static DEFINE_PER_CPU(long, nr_dentry_negative);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
@@ -152,11 +153,22 @@ static long get_nr_dentry_unused(void)
return sum < 0 ? 0 : sum;
}
+static long get_nr_dentry_negative(void)
+{
+ int i;
+ long sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += per_cpu(nr_dentry_negative, i);
+ return sum < 0 ? 0 : sum;
+}
+
int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
dentry_stat.nr_unused = get_nr_dentry_unused();
+ dentry_stat.nr_negative = get_nr_dentry_negative();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
@@ -257,24 +269,10 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
-static void __d_free_external_name(struct rcu_head *head)
-{
- struct external_name *name = container_of(head, struct external_name,
- u.head);
-
- mod_node_page_state(page_pgdat(virt_to_page(name)),
- NR_INDIRECTLY_RECLAIMABLE_BYTES,
- -ksize(name));
-
- kfree(name);
-}
-
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
-
- __d_free_external_name(&external_name(dentry)->u.head);
-
+ kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
}
@@ -306,7 +304,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name)
struct external_name *p;
p = container_of(name->name, struct external_name, name[0]);
if (unlikely(atomic_dec_and_test(&p->u.count)))
- call_rcu(&p->u.head, __d_free_external_name);
+ kfree_rcu(p, u.head);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
@@ -331,6 +329,8 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ this_cpu_inc(nr_dentry_negative);
}
static void dentry_free(struct dentry *dentry)
@@ -385,6 +385,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
* The per-cpu "nr_dentry_unused" counters are updated with
* the DCACHE_LRU_LIST bit.
*
+ * The per-cpu "nr_dentry_negative" counters are only updated
+ * when deleted from or added to the per-superblock LRU list, not
+ * from/to the shrink list. That is to avoid an unneeded dec/inc
+ * pair when moving from LRU to shrink list in select_collect().
+ *
* These helper functions make sure we always follow the
* rules. d_lock must be held by the caller.
*/
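
Summarising the helpers that follow (derived from their bodies, not a quote):

	/*
	 * d_lru_add:         nr_dentry_unused++; nr_dentry_negative++ if negative
	 * d_lru_del:         nr_dentry_unused--; nr_dentry_negative-- if negative
	 * d_lru_isolate:     nr_dentry_unused--; nr_dentry_negative-- if negative
	 * d_lru_shrink_move: nr_dentry_negative-- if negative (unused unchanged)
	 */
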
@@ -394,6 +399,8 @@ static void d_lru_add(struct dentry *dentry)
D_FLAG_VERIFY(dentry, 0);
dentry->d_flags |= DCACHE_LRU_LIST;
this_cpu_inc(nr_dentry_unused);
+ if (d_is_negative(dentry))
+ this_cpu_inc(nr_dentry_negative);
WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
@@ -402,6 +409,8 @@ static void d_lru_del(struct dentry *dentry)
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
+ if (d_is_negative(dentry))
+ this_cpu_dec(nr_dentry_negative);
WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
@@ -432,6 +441,8 @@ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
+ if (d_is_negative(dentry))
+ this_cpu_dec(nr_dentry_negative);
list_lru_isolate(lru, &dentry->d_lru);
}
@@ -440,6 +451,8 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags |= DCACHE_SHRINK_LIST;
+ if (d_is_negative(dentry))
+ this_cpu_dec(nr_dentry_negative);
list_lru_isolate_move(lru, &dentry->d_lru, list);
}
@@ -1202,15 +1215,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
*/
void shrink_dcache_sb(struct super_block *sb)
{
- long freed;
-
do {
LIST_HEAD(dispose);
- freed = list_lru_walk(&sb->s_dentry_lru,
+ list_lru_walk(&sb->s_dentry_lru,
dentry_lru_isolate_shrink, &dispose, 1024);
-
- this_cpu_sub(nr_dentry_unused, freed);
shrink_dentry_list(&dispose);
} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
@@ -1606,7 +1615,6 @@ EXPORT_SYMBOL(d_invalidate);
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
- struct external_name *ext = NULL;
struct dentry *dentry;
char *dname;
int err;
@@ -1627,14 +1635,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_iname;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
-
- ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT);
- if (!ext) {
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT |
+ __GFP_RECLAIMABLE);
+ if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
- atomic_set(&ext->u.count, 1);
- dname = ext->name;
+ atomic_set(&p->u.count, 1);
+ dname = p->name;
} else {
dname = dentry->d_iname;
}
@@ -1673,12 +1682,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
}
}
- if (unlikely(ext)) {
- pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
- mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
- ksize(ext));
- }
-
this_cpu_inc(nr_dentry);
return dentry;
@@ -1840,6 +1843,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
WARN_ON(d_in_lookup(dentry));
spin_lock(&dentry->d_lock);
+ /*
+ * Decrement negative dentry count if it was in the LRU list.
+ */
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ this_cpu_dec(nr_dentry_negative);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
@@ -2707,7 +2715,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
dentry->d_name.hash_len = target->d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
- call_rcu(&old_name->u.head, __d_free_external_name);
+ kfree_rcu(old_name, u.head);
}
/*
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 13b01351dd1c..95b5e78c22b1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -324,7 +324,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
static struct dentry *end_creating(struct dentry *dentry)
@@ -347,7 +347,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
dentry = start_creating(name, parent);
if (IS_ERR(dentry))
- return NULL;
+ return dentry;
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode))
@@ -386,7 +386,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
+ * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -422,8 +423,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
* debugfs core.
*
* It is your responsibility to protect your struct file_operation
- * methods against file removals by means of debugfs_use_file_start()
- * and debugfs_use_file_finish(). ->open() is still protected by
+ * methods against file removals by means of debugfs_file_get()
+ * and debugfs_file_put(). ->open() is still protected by
* debugfs though.
*
* Any struct file_operations defined by means of
@@ -464,7 +465,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
+ * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -495,7 +497,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
+ * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -506,7 +509,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
struct inode *inode;
if (IS_ERR(dentry))
- return NULL;
+ return dentry;
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode))
@@ -545,7 +548,7 @@ struct dentry *debugfs_create_automount(const char *name,
struct inode *inode;
if (IS_ERR(dentry))
- return NULL;
+ return dentry;
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode))
@@ -581,8 +584,8 @@ EXPORT_SYMBOL(debugfs_create_automount);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the symbolic
* link is to be removed (no automatic cleanup happens if your module is
- * unloaded, you are responsible here.) If an error occurs, %NULL will be
- * returned.
+ * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR)
+ * will be returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -594,12 +597,12 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
struct inode *inode;
char *link = kstrdup(target, GFP_KERNEL);
if (!link)
- return NULL;
+ return ERR_PTR(-ENOMEM);
dentry = start_creating(name, parent);
if (IS_ERR(dentry)) {
kfree(link);
- return NULL;
+ return dentry;
}
inode = debugfs_get_inode(dentry->d_sb);
@@ -787,6 +790,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
struct dentry *dentry = NULL, *trap;
struct name_snapshot old_name;
+ if (IS_ERR(old_dir))
+ return old_dir;
+ if (IS_ERR(new_dir))
+ return new_dir;
+ if (IS_ERR_OR_NULL(old_dentry))
+ return old_dentry;
+
trap = lock_rename(new_dir, old_dir);
/* Source or destination directories don't exist? */
if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
@@ -820,7 +830,9 @@ exit:
if (dentry && !IS_ERR(dentry))
dput(dentry);
unlock_rename(new_dir, old_dir);
- return NULL;
+ if (IS_ERR(dentry))
+ return dentry;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(debugfs_rename);
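The NULL-to-ERR_PTR conversion above changes the contract for callers: code that used to treat a NULL dentry as failure must now use IS_ERR(). A caller-side sketch (the file name and module state below are invented for illustration):

#include <linux/debugfs.h>
#include <linux/err.h>

static struct dentry *example_dir;	/* hypothetical module state */

static int example_debugfs_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (IS_ERR(example_dir))	/* was: if (!example_dir) */
		return PTR_ERR(example_dir);
	return 0;
}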
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 093fb54cd316..9bb015bc4a83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,8 +325,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
*/
dio->iocb->ki_pos += transferred;
- if (dio->op == REQ_OP_WRITE)
- ret = generic_write_sync(dio->iocb, transferred);
+ if (ret > 0 && dio->op == REQ_OP_WRITE)
+ ret = generic_write_sync(dio->iocb, ret);
dio->iocb->ki_complete(dio->iocb, ret, 0);
}
@@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio)
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
+ !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
- bio_for_each_segment_all(bvec, bio, i) {
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
if (dio->op == REQ_OP_READ && !PageCompound(page) &&
@@ -679,6 +681,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
unsigned long fs_count; /* Number of filesystem-sized blocks */
int create;
unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
+ loff_t i_size;
/*
* If there was a memory error and we've overwritten all the
@@ -708,8 +711,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
*/
create = dio->op == REQ_OP_WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
- if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
- i_blkbits))
+ i_size = i_size_read(dio->inode);
+ if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
create = 0;
}
@@ -1265,6 +1268,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else {
dio->op = REQ_OP_READ;
}
+ if (iocb->ki_flags & IOCB_HIPRI)
+ dio->op_flags |= REQ_HIPRI;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
@@ -1313,7 +1318,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
- dio->should_dirty = (iter->type == ITER_IOVEC);
+ dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
sdio.iter = iter;
sdio.final_block_in_request = end >> blkbits;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 562fa8c3edff..47ee66d70109 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -292,6 +292,8 @@ void dlm_callback_suspend(struct dlm_ls *ls)
flush_workqueue(ls->ls_callback_wq);
}
+#define MAX_CB_QUEUE 25
+
void dlm_callback_resume(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
@@ -302,15 +304,23 @@ void dlm_callback_resume(struct dlm_ls *ls)
if (!ls->ls_callback_wq)
return;
+more:
mutex_lock(&ls->ls_cb_mutex);
list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
list_del_init(&lkb->lkb_cb_list);
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
count++;
+ if (count == MAX_CB_QUEUE)
+ break;
}
mutex_unlock(&ls->ls_cb_mutex);
if (count)
log_rinfo(ls, "dlm_callback_resume %d", count);
+ if (count == MAX_CB_QUEUE) {
+ count = 0;
+ cond_resched();
+ goto more;
+ }
}
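The MAX_CB_QUEUE change above bounds how long ls_cb_mutex is held: at most 25 delayed callbacks are requeued per pass, with a cond_resched() between passes so a large backlog cannot stall the CPU. The same drain-in-batches shape, sketched generically with invented names:

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/sched.h>

#define BATCH_MAX 25

static void drain_in_batches(struct list_head *pending, struct mutex *lock,
			     void (*dispatch)(struct list_head *item))
{
	struct list_head *item, *tmp;
	int count;

more:
	count = 0;
	mutex_lock(lock);
	list_for_each_safe(item, tmp, pending) {
		list_del_init(item);
		dispatch(item);
		if (++count == BATCH_MAX)
			break;
	}
	mutex_unlock(lock);
	if (count == BATCH_MAX) {
		cond_resched();		/* yield before the next batch */
		goto more;
	}
}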
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index cc91963683de..a928ba008d7d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1209,6 +1209,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
if (rv < 0) {
log_error(ls, "create_lkb idr error %d", rv);
+ dlm_free_lkb(lkb);
return rv;
}
@@ -4179,6 +4180,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
(unsigned long long)lkb->lkb_recover_seq,
ms->m_header.h_nodeid, ms->m_lkid);
error = -ENOENT;
+ dlm_put_lkb(lkb);
goto fail;
}
@@ -4232,6 +4234,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
lkb->lkb_id, lkb->lkb_remid,
ms->m_header.h_nodeid, ms->m_lkid);
error = -ENOENT;
+ dlm_put_lkb(lkb);
goto fail;
}
@@ -5792,20 +5795,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
goto out;
}
}
-
- /* After ua is attached to lkb it will be freed by dlm_free_lkb().
- When DLM_IFL_USER is set, the dlm knows that this is a userspace
- lock and that lkb_astparam is the dlm_user_args structure. */
-
error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
- lkb->lkb_flags |= DLM_IFL_USER;
-
if (error) {
+ kfree(ua->lksb.sb_lvbptr);
+ ua->lksb.sb_lvbptr = NULL;
+ kfree(ua);
__put_lkb(ls, lkb);
goto out;
}
+ /* After ua is attached to lkb it will be freed by dlm_free_lkb().
+ When DLM_IFL_USER is set, the dlm knows that this is a userspace
+ lock and that lkb_astparam is the dlm_user_args structure. */
+ lkb->lkb_flags |= DLM_IFL_USER;
error = request_lock(ls, lkb, name, namelen, &args);
switch (error) {
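The receive_convert() and receive_unlock() fixes above enforce the same rule: a successful find_lkb() takes a reference that must be dropped on every exit path, including the validation-failure one. The shape of the fix, sketched against the fs/dlm/lock.c internals with a hypothetical validity check:

static int receive_op_sketch(struct dlm_ls *ls, u32 lkid)
{
	struct dlm_lkb *lkb;
	int error;

	error = find_lkb(ls, lkid, &lkb);	/* takes a reference */
	if (error)
		return error;

	if (!lkb_is_valid(lkb)) {		/* hypothetical check */
		dlm_put_lkb(lkb);		/* drop the lookup reference */
		return -ENOENT;
	}
	/* ... normal processing, which drops the reference itself ... */
	dlm_put_lkb(lkb);
	return 0;
}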
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5ba94be006ee..db43b98c4d64 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -431,7 +431,7 @@ static int new_lockspace(const char *name, const char *cluster,
int do_unreg = 0;
int namelen = strlen(name);
- if (namelen > DLM_LOCKSPACE_LEN)
+ if (namelen > DLM_LOCKSPACE_LEN || namelen == 0)
return -EINVAL;
if (!lvblen || (lvblen % 8))
@@ -680,11 +680,9 @@ static int new_lockspace(const char *name, const char *cluster,
kfree(ls->ls_recover_buf);
out_lkbidr:
idr_destroy(&ls->ls_lkbidr);
- for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
- if (ls->ls_remove_names[i])
- kfree(ls->ls_remove_names[i]);
- }
out_rsbtbl:
+ for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
+ kfree(ls->ls_remove_names[i]);
vfree(ls->ls_rsbtbl);
out_lsfree:
if (do_unreg)
@@ -807,6 +805,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_delete_debug_file(ls);
+ idr_destroy(&ls->ls_recover_idr);
kfree(ls->ls_recover_buf);
/*
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a5e4a221435c..c98ad9777ad9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -674,7 +674,7 @@ static int receive_from_sock(struct connection *con)
nvec = 2;
}
len = iov[0].iov_len + iov[1].iov_len;
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nvec, len);
+ iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len);
r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL);
if (ret <= 0)
@@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con)
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
0);
memset(&tv, 0, sizeof(tv));
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
if (result == -EINPROGRESS)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 3fda3832cf6a..0bc43b35d2c5 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -671,7 +671,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
int dlm_ls_start(struct dlm_ls *ls)
{
struct dlm_recover *rv, *rv_old;
- struct dlm_config_node *nodes;
+ struct dlm_config_node *nodes = NULL;
int error, count;
rv = kzalloc(sizeof(*rv), GFP_NOFS);
@@ -680,7 +680,7 @@ int dlm_ls_start(struct dlm_ls *ls)
error = dlm_config_nodes(ls->ls_name, &nodes, &count);
if (error < 0)
- goto fail;
+ goto fail_rv;
spin_lock(&ls->ls_recover_lock);
@@ -712,8 +712,9 @@ int dlm_ls_start(struct dlm_ls *ls)
return 0;
fail:
- kfree(rv);
kfree(nodes);
+ fail_rv:
+ kfree(rv);
return error;
}
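The dlm_ls_start() fix is the classic staged-unwind pattern: each error label frees only what exists at that point, in reverse order of allocation, so a failure before dlm_config_nodes() no longer touches an uninitialized nodes pointer. A compact sketch with invented helpers:

#include <linux/slab.h>

static int get_nodes(void **nodes);		  /* hypothetical */
static int start_recovery(void *rv, void *nodes); /* hypothetical */

static int staged_setup(void)
{
	void *rv, *nodes = NULL;
	int error;

	rv = kzalloc(64, GFP_NOFS);
	if (!rv)
		return -ENOMEM;

	error = get_nodes(&nodes);
	if (error)
		goto fail_rv;		/* nodes was never allocated */

	error = start_recovery(rv, nodes);
	if (error)
		goto fail;
	return 0;

fail:
	kfree(nodes);
fail_rv:
	kfree(rv);
	return error;
}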
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 7cd24bccd4fe..37be29f21d04 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -38,10 +38,8 @@ int __init dlm_memory_init(void)
void dlm_memory_exit(void)
{
- if (lkb_cache)
- kmem_cache_destroy(lkb_cache);
- if (rsb_cache)
- kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(lkb_cache);
+ kmem_cache_destroy(rsb_cache);
}
char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -86,8 +84,7 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
struct dlm_user_args *ua;
ua = lkb->lkb_ua;
if (ua) {
- if (ua->lksb.sb_lvbptr)
- kfree(ua->lksb.sb_lvbptr);
+ kfree(ua->lksb.sb_lvbptr);
kfree(ua);
}
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 2a669390cd7f..3c84c62dadb7 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -25,6 +25,7 @@
#include "lvb_table.h"
#include "user.h"
#include "ast.h"
+#include "config.h"
static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
@@ -404,7 +405,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = dlm_new_lockspace(params->name, NULL, params->flags,
+ error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags,
DLM_USER_LVB_LEN, NULL, NULL, NULL,
&lockspace);
if (error)
@@ -702,7 +703,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat,
result.version[0] = DLM_DEVICE_VERSION_MAJOR;
result.version[1] = DLM_DEVICE_VERSION_MINOR;
result.version[2] = DLM_DEVICE_VERSION_PATCH;
- memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+ memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr));
result.user_lksb = ua->user_lksb;
/* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
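The copy_result_to_user() hunk stops copying sizeof(struct dlm_lksb) and copies only up to the sb_lvbptr member, so the kernel pointer stored there never reaches the user-visible result. The trick in isolation (the field layout below is illustrative, not the exact struct dlm_lksb):

#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

struct lksb_like {
	int	sb_status;
	u32	sb_lkid;
	char	sb_flags;
	char	*sb_lvbptr;	/* kernel pointer: must not reach userspace */
};

static void copy_public_part(struct lksb_like *dst,
			     const struct lksb_like *src)
{
	/* copies the leading fields and stops before the pointer */
	memcpy(dst, src, offsetof(struct lksb_like, sb_lvbptr));
}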
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 82377017130f..d31b6c72b476 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
+ /*
+ * We must skip inodes in an unusual state. We could also skip
+ * inodes without pages, but we deliberately don't when a
+ * reschedule is due, so the loop reaches the cond_resched()
+ * below and softlockups are avoided.
+ */
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0)) {
+ (inode->i_mapping->nrpages == 0 && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
+ cond_resched();
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4dd842f72846..f664da55234e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -610,7 +610,8 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm,
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -1590,7 +1591,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
if (*key_size == 0)
*key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42bbe6824b4b..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -381,7 +382,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
*/
static inline int ep_events_available(struct eventpoll *ep)
{
- return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+ return !list_empty_careful(&ep->rdllist) ||
+ READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -471,7 +473,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* no re-entered.
*
* @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @max_nests: Maximum number of allowed nesting calls.
* @nproc: Nested call core function pointer.
* @priv: Opaque data to be passed to the @nproc callback.
* @cookie: Cookie to be used to identify this nested call.
@@ -480,7 +481,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 * Returns: the code returned by the @nproc callback, or -1 if
* the maximum recursion limit has been exceeded.
*/
-static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+static int ep_call_nested(struct nested_calls *ncalls,
int (*nproc)(void *, void *, int), void *priv,
void *cookie, void *ctx)
{
@@ -499,7 +500,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
*/
list_for_each_entry(tncur, lsthead, llink) {
if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > max_nests)) {
+ (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
/*
 * Oops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself.
@@ -573,7 +574,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
{
int this_cpu = get_cpu();
- ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+ ep_call_nested(&poll_safewake_ncalls,
ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
put_cpu();
@@ -697,23 +698,23 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
- ep->ovflist = NULL;
- spin_unlock_irq(&ep->wq.lock);
+ WRITE_ONCE(ep->ovflist, NULL);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
- for (nepi = ep->ovflist; (epi = nepi) != NULL;
+ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep it in FIFO order.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -731,7 +736,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
- ep->ovflist = EP_UNACTIVE_PTR;
+ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
/*
* Quickly re-inject items left on "txlist".
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list in only one
+ * direction, i.e. either to the tail or to the head; otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if the element has already been added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is a simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that the same element has just been
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be set to the head (we are
+ * inserting at the tail) and only then are the pointers atomically
+ * exchanged. XCHG guarantees memory ordering, so ->next is visibly
+ * updated before the pointers are actually swapped, and the pointers
+ * are swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry at the head of ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if the epi element has already been chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically push epi onto the head of the list */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to contend with concurrent
+ * events from other file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth mentioning is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if the poll table was
+ * initialized with several wait queue entries. Wakeups from different CPUs
+ * of a single wait queue are serialized by wq.lock, but the case where
+ * multiple wait queues are used must be detected separately. This is done
+ * with a cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1154,25 +1246,16 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
- if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = ep->ovflist;
- ep->ovflist = epi;
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1333,7 +1416,6 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
}
} else {
error = ep_call_nested(&poll_loop_ncalls,
- EP_MAX_NESTS,
reverse_path_check_proc,
child_file, child_file,
current);
@@ -1367,7 +1449,7 @@ static int reverse_path_check(void)
/* let's call this for all tfiles */
list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
path_count_init();
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
reverse_path_check_proc, current_file,
current_file, current);
if (error)
@@ -1489,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1501,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1532,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1579,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1600,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1626,21 +1708,24 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
{
struct ep_send_events_data *esed = priv;
__poll_t revents;
- struct epitem *epi;
- struct epoll_event __user *uevent;
+ struct epitem *epi, *tmp;
+ struct epoll_event __user *uevent = esed->events;
struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
+ esed->res = 0;
/*
* We can loop without lock because we are passed a task private list.
* Items cannot vanish during the loop because ep_scan_ready_list() is
* holding "mtx" during this call.
*/
- for (esed->res = 0, uevent = esed->events;
- !list_empty(head) && esed->res < esed->maxevents;) {
- epi = list_first_entry(head, struct epitem, rdllink);
+ lockdep_assert_held(&ep->mtx);
+
+ list_for_each_entry_safe(epi, tmp, head, rdllink) {
+ if (esed->res >= esed->maxevents)
+ break;
/*
* Activate ep->ws before deactivating epi->ws to prevent
@@ -1660,42 +1745,42 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
list_del_init(&epi->rdllink);
- revents = ep_item_poll(epi, &pt, 1);
-
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, ep_scan_ready_list()
- * is holding "mtx", so no operations coming from userspace
+ * is holding ep->mtx, so no operations coming from userspace
* can change the item.
*/
- if (revents) {
- if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
- list_add(&epi->rdllink, head);
- ep_pm_stay_awake(epi);
- if (!esed->res)
- esed->res = -EFAULT;
- return 0;
- }
- esed->res++;
- uevent++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
- /*
- * If this file has been added with Level
- * Trigger mode, we need to insert back inside
- * the ready list, so that the next call to
- * epoll_wait() will check again the events
- * availability. At this point, no one can insert
- * into ep->rdllist besides us. The epoll_ctl()
- * callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
- * poll callback will queue them in ep->ovflist.
- */
- list_add_tail(&epi->rdllink, &ep->rdllist);
- ep_pm_stay_awake(epi);
- }
+ revents = ep_item_poll(epi, &pt, 1);
+ if (!revents)
+ continue;
+
+ if (__put_user(revents, &uevent->events) ||
+ __put_user(epi->event.data, &uevent->data)) {
+ list_add(&epi->rdllink, head);
+ ep_pm_stay_awake(epi);
+ if (!esed->res)
+ esed->res = -EFAULT;
+ return 0;
+ }
+ esed->res++;
+ uevent++;
+ if (epi->event.events & EPOLLONESHOT)
+ epi->event.events &= EP_PRIVATE_BITS;
+ else if (!(epi->event.events & EPOLLET)) {
+ /*
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, no one can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_scan_ready_list() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
}
}
@@ -1747,6 +1832,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
+ bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1761,11 +1847,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation.
+ * caller specified a non-blocking operation. We still need the
+ * lock because we could otherwise race with an epi being added
+ * to the ready list from the irq callback, and incorrectly
+ * return 0 back to userspace.
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
- goto check_events;
+
+ write_lock_irq(&ep->lock);
+ eavail = ep_events_available(ep);
+ write_unlock_irq(&ep->lock);
+
+ goto send_events;
}
fetch_events:
@@ -1773,64 +1866,66 @@ fetch_events:
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ goto send_events;
- if (!ep_events_available(ep)) {
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
+ ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller.
- * We need to sleep here, and we will be wake up by
- * ep_poll_callback() when events will become available.
- */
+ /*
+ * We don't have any available event to return to the caller. We need
+ * to sleep here, and we will be woken by ep_poll_callback() when events
+ * become available.
+ */
+ if (!waiter) {
+ waiter = true;
init_waitqueue_entry(&wait, current);
- __add_wait_queue_exclusive(&ep->wq, &wait);
- for (;;) {
- /*
- * We don't want to sleep if the ep_poll_callback() sends us
- * a wakeup in between. That's why we set the task state
- * to TASK_INTERRUPTIBLE before doing the checks.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
- */
- if (fatal_signal_pending(current)) {
- res = -EINTR;
- break;
- }
- if (ep_events_available(ep) || timed_out)
- break;
- if (signal_pending(current)) {
- res = -EINTR;
- break;
- }
+ spin_lock_irq(&ep->wq.lock);
+ __add_wait_queue_exclusive(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
- spin_unlock_irq(&ep->wq.lock);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
- timed_out = 1;
+ for (;;) {
+ /*
+ * We don't want to sleep if the ep_poll_callback() sends us
+ * a wakeup in between. That's why we set the task state
+ * to TASK_INTERRUPTIBLE before doing the checks.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ if (fatal_signal_pending(current)) {
+ res = -EINTR;
+ break;
+ }
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ break;
+ if (signal_pending(current)) {
+ res = -EINTR;
+ break;
}
- __remove_wait_queue(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
+ if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+ timed_out = 1;
+ break;
+ }
}
-check_events:
- /* Is it worth to try to dig for events ? */
- eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ __set_current_state(TASK_RUNNING);
+send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
@@ -1840,6 +1935,12 @@ check_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
+ if (waiter) {
+ spin_lock_irq(&ep->wq.lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
+
return res;
}
@@ -1876,7 +1977,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit->visited)
continue;
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
ep_tovisit, current);
if (error != 0)
@@ -1916,7 +2017,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
int ret;
struct eventpoll *ep_cur, *ep_next;
- ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ret = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
/* clear visited list */
list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
@@ -2172,7 +2273,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
return -EINVAL;
/* Verify that the area passed by the user is writeable */
- if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+ if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
@@ -2223,31 +2324,13 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- if (sigmask) {
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
- if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
- return -EFAULT;
- sigsaved = current->blocked;
- set_current_blocked(&ksigmask);
- }
+ error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (error)
+ return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (error == -EINTR) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- set_current_blocked(&sigsaved);
- }
+ restore_user_sigmask(sigmask, &sigsaved);
return error;
}
@@ -2266,31 +2349,13 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (get_compat_sigset(&ksigmask, sigmask))
- return -EFAULT;
- sigsaved = current->blocked;
- set_current_blocked(&ksigmask);
- }
+ err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (err)
+ return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (err == -EINTR) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- set_current_blocked(&sigsaved);
- }
+ restore_user_sigmask(sigmask, &sigsaved);
return err;
}
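Taken together, the eventpoll hunks invert the usual rwlock roles: every producer (ep_poll_callback()) takes ep->lock for reading while doing a lockless list insert, and the single consumer (ep_scan_ready_list()) takes it for writing, which excludes all producers before the ready list is spliced. A stripped-down sketch of that pairing, assuming the list_add_tail_lockless() helper added above:

#include <linux/list.h>
#include <linux/spinlock.h>

struct ready_list {			/* illustrative stand-in for eventpoll */
	rwlock_t lock;
	struct list_head head;
};

static void producer_add(struct ready_list *rl, struct list_head *item)
{
	unsigned long flags;

	read_lock_irqsave(&rl->lock, flags);
	/* many producers may run here concurrently; the insert is lockless */
	list_add_tail_lockless(item, &rl->head);
	read_unlock_irqrestore(&rl->lock, flags);
}

static void consumer_splice(struct ready_list *rl, struct list_head *txlist)
{
	write_lock_irq(&rl->lock);	/* excludes every producer */
	list_splice_init(&rl->head, txlist);
	write_unlock_irq(&rl->lock);
}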
diff --git a/fs/exec.c b/fs/exec.c
index fc281b738a98..2e0033348d8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -218,55 +218,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
if (ret <= 0)
return NULL;
- if (write) {
- unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
- unsigned long ptr_size, limit;
-
- /*
- * Since the stack will hold pointers to the strings, we
- * must account for them as well.
- *
- * The size calculation is the entire vma while each arg page is
- * built, so each time we get here it's calculating how far it
- * is currently (rather than each call being just the newly
- * added size from the arg page). As a result, we need to
- * always add the entire size of the pointers, so that on the
- * last call to get_arg_page() we'll actually have the entire
- * correct size.
- */
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
- if (ptr_size > ULONG_MAX - size)
- goto fail;
- size += ptr_size;
-
- acct_arg_size(bprm, size / PAGE_SIZE);
-
- /*
- * We've historically supported up to 32 pages (ARG_MAX)
- * of argument strings even with small stacks
- */
- if (size <= ARG_MAX)
- return page;
-
- /*
- * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
- * (whichever is smaller) for the argv+env strings.
- * This ensures that:
- * - the remaining binfmt code will not run out of stack space,
- * - the program will have a reasonable amount of stack left
- * to work from.
- */
- limit = _STK_LIM / 4 * 3;
- limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
- if (size > limit)
- goto fail;
- }
+ if (write)
+ acct_arg_size(bprm, vma_pages(bprm->vma));
return page;
-
-fail:
- put_page(page);
- return NULL;
}
static void put_arg_page(struct page *page)
@@ -492,6 +447,50 @@ static int count(struct user_arg_ptr argv, int max)
return i;
}
+static int prepare_arg_pages(struct linux_binprm *bprm,
+ struct user_arg_ptr argv, struct user_arg_ptr envp)
+{
+ unsigned long limit, ptr_size;
+
+ bprm->argc = count(argv, MAX_ARG_STRINGS);
+ if (bprm->argc < 0)
+ return bprm->argc;
+
+ bprm->envc = count(envp, MAX_ARG_STRINGS);
+ if (bprm->envc < 0)
+ return bprm->envc;
+
+ /*
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
+ * (whichever is smaller) for the argv+env strings.
+ * This ensures that:
+ * - the remaining binfmt code will not run out of stack space,
+ * - the program will have a reasonable amount of stack left
+ * to work from.
+ */
+ limit = _STK_LIM / 4 * 3;
+ limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
+ /*
+ * We've historically supported up to 32 pages (ARG_MAX)
+ * of argument strings even with small stacks
+ */
+ limit = max_t(unsigned long, limit, ARG_MAX);
+ /*
+ * We must account for the size of all the argv and envp pointers to
+ * the argv and envp strings, since they will also take up space in
+ * the stack. They aren't stored until much later when we can't
+ * signal to the parent that the child has run out of stack space.
+ * Instead, calculate it here so it's possible to fail gracefully.
+ */
+ ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ if (limit <= ptr_size)
+ return -E2BIG;
+ limit -= ptr_size;
+
+ bprm->argmin = bprm->p - limit;
+ return 0;
+}
+
/*
* 'copy_strings()' copies argument/environment strings from the old
* processes's memory to the new process's stack. The call to get_user_pages()
@@ -527,6 +526,10 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
pos = bprm->p;
str += len;
bprm->p -= len;
+#ifdef CONFIG_MMU
+ if (bprm->p < bprm->argmin)
+ goto out;
+#endif
while (len > 0) {
int offset, bytes_to_copy;
@@ -929,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
if (bytes < 0) {
ret = bytes;
- goto out;
+ goto out_free;
}
if (bytes == 0)
@@ -1084,7 +1087,7 @@ static int de_thread(struct task_struct *tsk)
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
schedule();
- if (unlikely(__fatal_signal_pending(tsk)))
+ if (__fatal_signal_pending(tsk))
goto killed;
spin_lock_irq(lock);
}
@@ -1112,7 +1115,7 @@ static int de_thread(struct task_struct *tsk)
write_unlock_irq(&tasklist_lock);
cgroup_threadgroup_change_end(tsk);
schedule();
- if (unlikely(__fatal_signal_pending(tsk)))
+ if (__fatal_signal_pending(tsk))
goto killed;
}
@@ -1186,7 +1189,7 @@ no_thread_group:
flush_itimer_signals();
#endif
- if (atomic_read(&oldsighand->count) != 1) {
+ if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
@@ -1196,7 +1199,7 @@ no_thread_group:
if (!newsighand)
return -ENOMEM;
- atomic_set(&newsighand->count, 1);
+ refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
@@ -1399,7 +1402,7 @@ EXPORT_SYMBOL(finalize_exec);
* Or, if exec fails before, free_bprm() should release ->cred and
* and unlock.
*/
-int prepare_bprm_creds(struct linux_binprm *bprm)
+static int prepare_bprm_creds(struct linux_binprm *bprm)
{
if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
@@ -1560,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
/*
* Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
@@ -1789,12 +1792,8 @@ static int __do_execve_file(int fd, struct filename *filename,
if (retval)
goto out_unmark;
- bprm->argc = count(argv, MAX_ARG_STRINGS);
- if ((retval = bprm->argc) < 0)
- goto out;
-
- bprm->envc = count(envp, MAX_ARG_STRINGS);
- if ((retval = bprm->envc) < 0)
+ retval = prepare_arg_pages(bprm, argv, envp);
+ if (retval < 0)
goto out;
retval = prepare_binprm(bprm);
@@ -1945,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt);
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- unsigned long old, new;
-
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- do {
- old = READ_ONCE(mm->flags);
- new = (old & ~MMF_DUMPABLE_MASK) | value;
- } while (cmpxchg(&mm->flags, old, new) != old);
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
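A worked example of the cap computed in prepare_arg_pages(), with illustrative x86-64 numbers (assumptions, not taken from the patch: _STK_LIM = 8 MiB, ARG_MAX = 128 KiB, 8-byte pointers, RLIMIT_STACK = 8 MiB):

#include <linux/kernel.h>

static unsigned long example_argmin(unsigned long p, int argc, int envc)
{
	unsigned long limit;

	limit = (8UL << 20) / 4 * 3;			  /* 3/4 of _STK_LIM: 6 MiB */
	limit = min(limit, (8UL << 20) / 4);		  /* 1/4 of rlim_cur: 2 MiB */
	limit = max_t(unsigned long, limit, 128UL << 10); /* >= ARG_MAX: still 2 MiB */
	limit -= (argc + envc) * sizeof(void *);	  /* room for the pointer array */
	return p - limit;	/* copy_strings() now fails below this point */
}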
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 5331a15a61f1..24a8e34882e9 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -420,8 +420,9 @@ static void _clear_bio(struct bio *bio)
{
struct bio_vec *bv;
unsigned i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
unsigned this_count = bv->bv_len;
if (likely(PAGE_SIZE == this_count))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 199590f36203..e83bab54b03e 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -468,11 +468,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
/* loop on all devices all pages */
for (d = 0; d < ios->numdevs; d++) {
struct bio *bio = ios->per_dev[d].bio;
+ struct bvec_iter_all iter_all;
if (!bio)
continue;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
SetPageUptodate(page);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 41cf2fbee50d..fc80c7233fa5 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -101,6 +101,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
token = match_token(p, tokens, args);
switch (token) {
case Opt_name:
+ kfree(opts->dev_name);
opts->dev_name = match_strdup(&args[0]);
if (unlikely(!opts->dev_name)) {
EXOFS_ERR("Error allocating dev_name");
@@ -117,7 +118,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
EXOFS_MIN_PID);
return -EINVAL;
}
- s_pid = 1;
+ s_pid = true;
break;
case Opt_to:
if (match_int(&args[0], &option))
@@ -704,21 +705,18 @@ out:
/*
* Read the superblock from the OSD and fill in the fields
*/
-static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+static int exofs_fill_super(struct super_block *sb,
+ struct exofs_mountopt *opts,
+ struct exofs_sb_info *sbi,
+ int silent)
{
struct inode *root;
- struct exofs_mountopt *opts = data;
- struct exofs_sb_info *sbi; /*extended info */
struct osd_dev *od; /* Master device */
struct exofs_fscb fscb; /*on-disk superblock info */
struct ore_comp comp;
unsigned table_count;
int ret;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
/* use mount options to fill superblock */
if (opts->is_osdname) {
struct osd_dev_info odi = {.systemid_len = 0};
@@ -862,16 +860,42 @@ static struct dentry *exofs_mount(struct file_system_type *type,
int flags, const char *dev_name,
void *data)
{
+ struct super_block *s;
struct exofs_mountopt opts;
+ struct exofs_sb_info *sbi;
int ret;
ret = parse_options(data, &opts);
- if (ret)
+ if (ret) {
+ kfree(opts.dev_name);
return ERR_PTR(ret);
+ }
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi) {
+ kfree(opts.dev_name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ s = sget(type, NULL, set_anon_super, flags, NULL);
+
+ if (IS_ERR(s)) {
+ kfree(opts.dev_name);
+ kfree(sbi);
+ return ERR_CAST(s);
+ }
if (!opts.dev_name)
opts.dev_name = dev_name;
- return mount_nodev(type, flags, &opts, exofs_fill_super);
+
+ ret = exofs_fill_super(s, &opts, sbi, flags & SB_SILENT ? 1 : 0);
+ if (ret) {
+ deactivate_locked_super(s);
+ return ERR_PTR(ret);
+ }
+ s->s_flags |= SB_ACTIVE;
+ return dget(s->s_root);
}

/*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 645158dc33f1..c69927bed4ef 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -77,7 +77,7 @@ static bool dentry_connected(struct dentry *dentry)
struct dentry *parent = dget_parent(dentry);
dput(dentry);
- if (IS_ROOT(dentry)) {
+ if (dentry == parent) {
dput(parent);
return false;
}
@@ -147,6 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+ err = PTR_ERR(tmp);
goto out_err;
}
if (tmp != dentry) {
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 224c04abb2e5..cf4c77f8dd08 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -256,11 +256,15 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
if (default_acl) {
error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
}
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3b8114def693..13318e255ebf 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask)
return (char *)p - base;
}
-static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
- [EXT2_FT_UNKNOWN] = DT_UNKNOWN,
- [EXT2_FT_REG_FILE] = DT_REG,
- [EXT2_FT_DIR] = DT_DIR,
- [EXT2_FT_CHRDEV] = DT_CHR,
- [EXT2_FT_BLKDEV] = DT_BLK,
- [EXT2_FT_FIFO] = DT_FIFO,
- [EXT2_FT_SOCK] = DT_SOCK,
- [EXT2_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK,
-};
-
static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
{
- umode_t mode = inode->i_mode;
if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
else
de->file_type = 0;
}
@@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
- unsigned char *types = NULL;
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+ bool has_filetype;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
- if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- types = ext2_filetype_table;
+ has_filetype =
+ EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (de->inode) {
unsigned char d_type = DT_UNKNOWN;
- if (types && de->file_type < EXT2_FT_MAX)
- d_type = types[de->file_type];
+ if (has_filetype)
+ d_type = fs_ftype_to_dtype(de->file_type);
if (!dir_emit(ctx, de->name, de->name_len,
le32_to_cpu(de->inode),
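The readdir and set_de_type paths now lean on the generic file-type helpers: fs_umode_to_ftype() maps an inode mode to the on-disk FT_* value, and fs_ftype_to_dtype() maps that back to the DT_* value dir_emit() expects. The round trip in one helper (a sketch; both functions come from linux/fs_types.h in this series):

#include <linux/fs_types.h>
#include <linux/types.h>

static unsigned char dtype_for_mode(umode_t mode)
{
	/* umode_t -> on-disk FT_* -> readdir DT_* */
	return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
}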
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 00e759f05161..10ab238de9a6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -390,11 +390,7 @@ struct ext2_inode {
#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
-#ifdef CONFIG_FS_DAX
#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
-#else
-#define EXT2_MOUNT_DAX 0
-#endif
#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
@@ -608,22 +604,6 @@ struct ext2_dir_entry_2 {
};
/*
- * Ext2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-enum {
- EXT2_FT_UNKNOWN = 0,
- EXT2_FT_REG_FILE = 1,
- EXT2_FT_DIR = 2,
- EXT2_FT_CHRDEV = 3,
- EXT2_FT_BLKDEV = 4,
- EXT2_FT_FIFO = 5,
- EXT2_FT_SOCK = 6,
- EXT2_FT_SYMLINK = 7,
- EXT2_FT_MAX
-};
-
-/*
* EXT2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -778,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_evict_inode(struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
+extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
extern void ext2_set_inode_flags(struct inode *inode);
extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 28b2609f25c1..39c4772e96c9 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c3d7b7e4975..a0c5ea91fcd4 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
best_desc = desc;
}
}
- if (!best_desc)
- return -1;
return best_group;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e4bb9386c045..c27c27300d95 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode,
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
/*
- * Next look up the indirect map to count the totoal number of
+ * Next look up the indirect map to count the total number of
* direct blocks to allocate for this branch.
*/
count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1239,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 1);
}
+ /* fall through */
case EXT2_IND_BLOCK:
nr = i_data[EXT2_DIND_BLOCK];
if (nr) {
@@ -1246,6 +1247,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 2);
}
+ /* fall through */
case EXT2_DIND_BLOCK:
nr = i_data[EXT2_TIND_BLOCK];
if (nr) {
@@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
+int ext2_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ unsigned int flags;
+
+ flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+ if (flags & EXT2_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (flags & EXT2_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (flags & EXT2_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (flags & EXT2_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_ENCRYPTED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
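With ->getattr wired up, the ext2 inode flags mapped in ext2_getattr() become visible to userspace through statx(2). A minimal check, assuming a glibc that provides the statx() wrapper (illustrative program, not part of this patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx)) {
		perror("statx");
		return 1;
	}
	/* attributes_mask advertises which bits the filesystem reports */
	if (stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE)
		printf("immutable: %s\n",
		       (stx.stx_attributes & STATX_ATTR_IMMUTABLE) ? "yes" : "no");
	if (stx.stx_attributes_mask & STATX_ATTR_APPEND)
		printf("append-only: %s\n",
		       (stx.stx_attributes & STATX_ATTR_APPEND) ? "yes" : "no");
	return 0;
}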
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 0c26dcc5d850..ccfbbf59e2fc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
@@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73bd58fa13de..0128010a0874 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -73,7 +73,7 @@ void ext2_error(struct super_block *sb, const char *function,
if (test_opt(sb, ERRORS_PANIC))
panic("EXT2-fs: panic from previous error\n");
- if (test_opt(sb, ERRORS_RO)) {
+ if (!sb_rdonly(sb) && test_opt(sb, ERRORS_RO)) {
ext2_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
sb->s_flags |= SB_RDONLY;
@@ -148,10 +148,9 @@ static void ext2_put_super (struct super_block * sb)
ext2_quota_off_umount(sb);
- if (sbi->s_ea_block_cache) {
- ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
- sbi->s_ea_block_cache = NULL;
- }
+ ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
+
if (!sb_rdonly(sb)) {
struct ext2_super_block *es = sbi->s_es;
@@ -309,20 +308,17 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
-#if defined(CONFIG_QUOTA)
if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
seq_puts(seq, ",usrquota");
if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
seq_puts(seq, ",grpquota");
-#endif
-#ifdef CONFIG_FS_DAX
if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
seq_puts(seq, ",xip");
+
if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
seq_puts(seq, ",dax");
-#endif
if (!test_opt(sb, RESERVATION))
seq_puts(seq, ",noreservation");
@@ -761,7 +757,8 @@ static loff_t ext2_max_size(int bits)
{
loff_t res = EXT2_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
+ unsigned int upper_limit;
+ unsigned int ppb = 1 << (bits-2);
/* This is calculated to be the largest file size for a
* dense file, such that the total number of
@@ -775,24 +772,34 @@ static loff_t ext2_max_size(int bits)
/* total blocks in file system block size */
upper_limit >>= (bits - 9);
+ /* Compute how many blocks we can address by block tree */
+ res += 1LL << (bits-2);
+ res += 1LL << (2*(bits-2));
+ res += 1LL << (3*(bits-2));
+ /* Does block tree limit file size? */
+ if (res < upper_limit)
+ goto check_lfs;
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT2_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* triple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+ DIV_ROUND_UP(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
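The rework sizes the metadata overhead only for blocks actually reachable under the i_blocks limit, instead of subtracting a full-tree metadata estimate that could exceed upper_limit. A standalone sketch of the two competing bounds for 4 KiB blocks, assuming the 32-bit, 512-byte-sector i_blocks count that seeds upper_limit (illustrative, not kernel code):

#include <stdio.h>

int main(void)
{
	int bits = 12;					/* 4 KiB blocks */
	unsigned long long ppb = 1ULL << (bits - 2);	/* 1024 pointers per block */
	unsigned long long tree = 12 + ppb + ppb * ppb + ppb * ppb * ppb;
	unsigned long long iblk = ((1ULL << 32) - 1) >> (bits - 9);

	/* The smaller bound governs; the kernel then subtracts the metadata
	 * blocks needed to address that many data blocks, scales by the
	 * block size, and clamps to MAX_LFS_FILESIZE. */
	printf("block-tree bound: %llu blocks\n", tree);	/* ~2^30 */
	printf("i_blocks bound:   %llu blocks\n", iblk);	/* ~2^29 */
	return 0;
}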
@@ -895,6 +902,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_magic != EXT2_SUPER_MAGIC)
goto cantfind_ext2;
+ opts.s_mount_opt = 0;
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
if (def_mount_opts & EXT2_DEFM_DEBUG)
@@ -1027,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT2_INODE_SIZE(sb) == 0)
- goto cantfind_ext2;
sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
goto cantfind_ext2;
@@ -1090,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount;
}
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
if (!sbi->s_debts) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount_group_desc;
}
@@ -1151,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT2_FS_XATTR
sbi->s_ea_block_cache = ext2_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
goto failed_mount3;
}
@@ -1200,8 +1209,7 @@ cantfind_ext2:
sb->s_id);
goto failed_mount;
failed_mount3:
- if (sbi->s_ea_block_cache)
- ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
+ ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index d5589ddcc281..00cdb8679486 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -23,6 +23,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.get_link = page_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
@@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.get_link = simple_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 62d9a659a8ff..1e33e0ac8cf1 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
return;
spin_lock(&EXT2_SB(sb)->s_lock);
+ ext2_update_dynamic_rev(sb);
EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
spin_unlock(&EXT2_SB(sb)->s_lock);
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
@@ -612,9 +613,9 @@ skip_replace:
}
cleanup:
- brelse(bh);
if (!(bh && header == HDR(bh)))
kfree(header);
+ brelse(bh);
up_write(&EXT2_I(inode)->xattr_sem);
return error;
@@ -835,7 +836,8 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
int error;
- error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
+ error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr,
+ true);
if (error) {
if (error == -EBUSY) {
ea_bdebug(bh, "already in cache (%d cache entries)",
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index c1d570ee1d9f..8c7bbf3e566d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -248,7 +248,8 @@ retry:
error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
goto out_stop;
- update_mode = 1;
+ if (mode != inode->i_mode)
+ update_mode = 1;
}
error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 12f90d48ba61..185a05d3257e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -45,15 +45,6 @@
#include <linux/compiler.h>
-/* Until this gets included into linux/compiler-gcc.h */
-#ifndef __nonstring
-#if defined(GCC_VERSION) && (GCC_VERSION >= 80000)
-#define __nonstring __attribute__((nonstring))
-#else
-#define __nonstring
-#endif
-#endif
-
/*
* The fourth extended filesystem constants/structures
*/
@@ -2463,8 +2454,19 @@ int do_journal_get_write_access(handle_t *handle,
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
-extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
+typedef enum {
+ EXT4_IGET_NORMAL = 0,
+ EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
+ EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+} ext4_iget_flags;
+
+extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+ ext4_iget_flags flags, const char *function,
+ unsigned int line);
+
+#define ext4_iget(sb, ino, flags) \
+ __ext4_iget((sb), (ino), (flags), __func__, __LINE__)
+
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
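The flag argument makes each caller declare why it is fetching the inode, letting __ext4_iget() reject out-of-range inode numbers up front with an error appropriate to the context. The three idioms, as used in later hunks of this diff (sketch):

/* directory lookup: ordinary inode, a reserved inode number is corruption */
inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);

/* mount/resize paths: a reserved system inode is expected and allowed */
inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL);

/* NFS file handles: a bogus number means a stale handle, so -ESTALE */
inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);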
@@ -2547,6 +2549,8 @@ extern int ext4_group_extend(struct super_block *sb,
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
+ sector_t block, int op_flags);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 26a7fe5c4fd3..5508baa11bb6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -159,6 +159,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = err;
}
out:
+ err = file_check_and_advance_wb_err(file);
+ if (ret == 0)
+ ret = err;
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
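file_check_and_advance_wb_err() folds any writeback error recorded against the mapping since this struct file last looked into the fsync() return value. Seen from userspace, a background write failure now surfaces once per open file description (illustrative program, assuming a path argument):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_WRONLY | O_CREAT, 0644);
	if (fd < 0 || write(fd, "x", 1) != 1)
		return 1;
	/* With the hunk above, an I/O error hit by earlier background
	 * writeback on this file is reported here even when the final
	 * flush itself succeeded. */
	if (fsync(fd) < 0)
		perror("fsync");
	return close(fd) != 0;
}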
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2addcb8730e1..7ff14a1adba3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1216,7 +1216,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (IS_ERR(bitmap_bh))
- return (struct inode *) bitmap_bh;
+ return ERR_CAST(bitmap_bh);
/* Having the inode bit set should be a 100% indicator that this
* is a valid orphan (no e2fsck run on fs). Orphans also include
@@ -1225,7 +1225,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (!ext4_test_bit(bit, bitmap_bh->b_data))
goto bad_orphan;
- inode = ext4_iget(sb, ino);
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
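ERR_CAST() re-types a pointer that encodes an errno without touching its value, replacing the bare C cast while keeping the __must_check annotation intact. Its definition in <linux/err.h> is essentially (paraphrased):

static inline void * __must_check ERR_CAST(__force const void *ptr)
{
	/* the encoded errno is preserved; only the pointer type changes */
	return (void *) ptr;
}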
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c4bac18cc6c..56f6e1782d5f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -705,8 +705,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
if (!PageUptodate(page)) {
ret = ext4_read_inline_page(inode, page);
- if (ret < 0)
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
goto out_up_read;
+ }
}
ret = 1;
@@ -1887,12 +1890,12 @@ int ext4_inline_data_fiemap(struct inode *inode,
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, start, physical,
- inline_len, flags);
brelse(iloc.bh);
out:
up_read(&EXT4_I(inode)->xattr_sem);
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, start, physical,
+ inline_len, flags);
return (error < 0 ? error : 0);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c3d9a42c561e..34d7e0703cc6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
- int tag;
+ xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
@@ -2778,7 +2778,8 @@ static int ext4_writepages(struct address_space *mapping,
* We may need to convert up to one extent per block in
* the page and we may dirty the inode.
*/
- rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+ PAGE_SIZE >> inode->i_blkbits);
}
/*
@@ -4817,7 +4818,9 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
return inode_peek_iversion(inode);
}
-struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+ ext4_iget_flags flags, const char *function,
+ unsigned int line)
{
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
@@ -4831,6 +4834,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
gid_t i_gid;
projid_t i_projid;
+ if ((!(flags & EXT4_IGET_SPECIAL) &&
+ (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+ (ino < EXT4_ROOT_INO) ||
+ (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+ if (flags & EXT4_IGET_HANDLE)
+ return ERR_PTR(-ESTALE);
+ __ext4_error(sb, function, line,
+ "inode #%lu: comm %s: iget: illegal inode #",
+ ino, current->comm);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -4846,18 +4861,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
raw_inode = ext4_raw_inode(&iloc);
if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
- EXT4_ERROR_INODE(inode, "root inode unallocated");
+ ext4_error_inode(inode, function, line, 0,
+ "iget: root inode unallocated");
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ if ((flags & EXT4_IGET_HANDLE) &&
+ (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
+ ret = -ESTALE;
+ goto bad_inode;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb) ||
(ei->i_extra_isize & 3)) {
- EXT4_ERROR_INODE(inode,
- "bad extra_isize %u (inode size %u)",
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bad extra_isize %u "
+ "(inode size %u)",
ei->i_extra_isize,
EXT4_INODE_SIZE(inode->i_sb));
ret = -EFSCORRUPTED;
@@ -4879,7 +4902,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
- EXT4_ERROR_INODE(inode, "checksum invalid");
+ ext4_error_inode(inode, function, line, 0,
+ "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4936,7 +4960,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
- EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
goto bad_inode;
}
@@ -5012,7 +5037,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
- EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bad extended attribute block %llu",
ei->i_file_acl);
ret = -EFSCORRUPTED;
goto bad_inode;
@@ -5040,8 +5066,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else if (S_ISLNK(inode->i_mode)) {
/* VFS does not allow setting these so must be corruption */
if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- EXT4_ERROR_INODE(inode,
- "immutable or append flags not allowed on symlinks");
+ ext4_error_inode(inode, function, line, 0,
+ "iget: immutable or append flags "
+ "not allowed on symlinks");
ret = -EFSCORRUPTED;
goto bad_inode;
}
@@ -5071,7 +5098,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
make_bad_inode(inode);
} else {
ret = -EFSCORRUPTED;
- EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
brelse(iloc.bh);
@@ -5085,13 +5113,6 @@ bad_inode:
return ERR_PTR(ret);
}
-struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
-{
- if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-EFSCORRUPTED);
- return ext4_iget(sb, ino);
-}
-
static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
@@ -5380,9 +5401,13 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
+ sb_rdonly(inode->i_sb))
return 0;
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
@@ -5398,7 +5423,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
- err = ext4_force_commit(inode->i_sb);
+ err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
} else {
struct ext4_iloc iloc;
@@ -5835,9 +5861,10 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
+ put_bh(iloc->bh);
return -EIO;
-
+ }
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0edee31913d1..d37dafa1d133 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -125,7 +125,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
return -EPERM;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 61a9d1927817..b1e4d359f73b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -116,9 +116,9 @@ static int update_ind_extent_range(handle_t *handle, struct inode *inode,
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, pblock);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -145,9 +145,9 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, pblock);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -175,9 +175,9 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, pblock);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -224,9 +224,9 @@ static int free_dind_blocks(handle_t *handle,
struct buffer_head *bh;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -254,9 +254,9 @@ static int free_tind_blocks(handle_t *handle,
struct buffer_head *bh;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -382,9 +382,9 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
struct ext4_extent_header *eh;
block = ext4_idx_pblock(ix);
- bh = sb_bread(inode->i_sb, block);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = (struct ext4_extent_header *)bh->b_data;
if (eh->eh_depth != 0) {
@@ -535,22 +535,22 @@ int ext4_ext_migrate(struct inode *inode)
if (i_data[EXT4_IND_BLOCK]) {
retval = update_ind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
- if (retval)
- goto err_out;
+ if (retval)
+ goto err_out;
} else
lb.curr_block += max_entries;
if (i_data[EXT4_DIND_BLOCK]) {
retval = update_dind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
- if (retval)
- goto err_out;
+ if (retval)
+ goto err_out;
} else
lb.curr_block += max_entries * max_entries;
if (i_data[EXT4_TIND_BLOCK]) {
retval = update_tind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
- if (retval)
- goto err_out;
+ if (retval)
+ goto err_out;
}
/*
* Build the last extent
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67a38532032a..2b928eb07fa2 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -126,6 +126,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!is_dx_block && type == INDEX) {
ext4_error_inode(inode, func, line, block,
"directory leaf block found instead of index block");
+ brelse(bh);
return ERR_PTR(-EFSCORRUPTED);
}
if (!ext4_has_metadata_csum(inode->i_sb) ||
@@ -1556,7 +1557,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
- return (struct dentry *) bh;
+ return ERR_CAST(bh);
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1570,7 +1571,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
dentry);
return ERR_PTR(-EFSCORRUPTED);
}
- inode = ext4_iget_normal(dir->i_sb, ino);
+ inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
@@ -1600,7 +1601,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
if (IS_ERR(bh))
- return (struct dentry *) bh;
+ return ERR_CAST(bh);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -1612,7 +1613,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
return ERR_PTR(-EFSCORRUPTED);
}
- return d_obtain_alias(ext4_iget_normal(child->d_sb, ino));
+ return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL));
}
/*
@@ -2811,7 +2812,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
list_del_init(&EXT4_I(inode)->i_orphan);
mutex_unlock(&sbi->s_orphan_lock);
}
- }
+ } else
+ brelse(iloc.bh);
+
jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2aa62d58d8dd..cff4c4aa7a9c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -63,8 +63,9 @@ static void ext4_finish_bio(struct bio *bio)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
struct page *data_page = NULL;
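bio_for_each_segment_all() now takes an on-stack struct bvec_iter_all so the macro can hand back one struct bio_vec per page even when a single bvec spans several pages. Every converted completion handler in this diff follows the same shape (sketch, not a specific kernel function):

static void example_end_io(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;
	int i;

	bio_for_each_segment_all(bvec, bio, i, iter_all) {
		struct page *page = bvec->bv_page;

		/* per-page completion work goes here */
		(void)page;
	}
}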
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f461d75ac049..e53639784892 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
if (ext4_bio_encrypted(bio)) {
if (bio->bi_status) {
@@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
return;
}
}
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -128,7 +129,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
prefetchw(&page->flags);
if (pages) {
- page = list_entry(pages->prev, struct page, lru);
+ page = lru_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
readahead_gfp_mask(mapping)))
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ebbc663d0798..48421de803b7 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -127,10 +127,12 @@ static int verify_group_input(struct super_block *sb,
else if (free_blocks_count < 0)
ext4_warning(sb, "Bad blocks count %u",
input->blocks_count);
- else if (!(bh = sb_bread(sb, end - 1)))
+ else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) {
+ err = PTR_ERR(bh);
+ bh = NULL;
ext4_warning(sb, "Cannot read last block (%llu)",
end - 1);
- else if (outside(input->block_bitmap, start, end))
+ } else if (outside(input->block_bitmap, start, end))
ext4_warning(sb, "Block bitmap not in group (block %llu)",
(unsigned long long)input->block_bitmap);
else if (outside(input->inode_bitmap, start, end))
@@ -459,16 +461,18 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
- if (err)
+ if (err) {
+ brelse(bh);
return err;
+ }
ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
first_cluster, first_cluster - start, count2);
ext4_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ brelse(bh);
if (unlikely(err))
return err;
- brelse(bh);
}
return 0;
@@ -605,7 +609,6 @@ handle_bb:
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
- bh = NULL;
goto out;
}
overhead = ext4_group_overhead_blocks(sb, group);
@@ -618,9 +621,9 @@ handle_bb:
ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
sb->s_blocksize * 8, bh->b_data);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ brelse(bh);
if (err)
goto out;
- brelse(bh);
handle_ib:
if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
@@ -635,18 +638,16 @@ handle_ib:
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
- bh = NULL;
goto out;
}
ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
sb->s_blocksize * 8, bh->b_data);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ brelse(bh);
if (err)
goto out;
- brelse(bh);
}
- bh = NULL;
/* Mark group tables in block bitmap */
for (j = 0; j < GROUP_TABLE_COUNT; j++) {
@@ -685,7 +686,6 @@ handle_ib:
}
out:
- brelse(bh);
err2 = ext4_journal_stop(handle);
if (err2 && !err)
err = err2;
@@ -783,11 +783,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
- struct buffer_head **o_group_desc, **n_group_desc;
- struct buffer_head *dind;
- struct buffer_head *gdb_bh;
+ struct buffer_head **o_group_desc, **n_group_desc = NULL;
+ struct buffer_head *dind = NULL;
+ struct buffer_head *gdb_bh = NULL;
int gdbackups;
- struct ext4_iloc iloc;
+ struct ext4_iloc iloc = { .bh = NULL };
__le32 *data;
int err;
@@ -796,21 +796,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
"EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
gdb_num);
- gdb_bh = sb_bread(sb, gdblock);
- if (!gdb_bh)
- return -EIO;
+ gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+ if (IS_ERR(gdb_bh))
+ return PTR_ERR(gdb_bh);
gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
if (gdbackups < 0) {
err = gdbackups;
- goto exit_bh;
+ goto errout;
}
data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
- dind = sb_bread(sb, le32_to_cpu(*data));
- if (!dind) {
- err = -EIO;
- goto exit_bh;
+ dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+ if (IS_ERR(dind)) {
+ err = PTR_ERR(dind);
+ dind = NULL;
+ goto errout;
}
data = (__le32 *)dind->b_data;
@@ -818,18 +819,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
ext4_warning(sb, "new group %u GDT block %llu not reserved",
group, gdblock);
err = -EINVAL;
- goto exit_dind;
+ goto errout;
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (unlikely(err))
- goto exit_dind;
+ goto errout;
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
- goto exit_dind;
+ goto errout;
BUFFER_TRACE(dind, "get_write_access");
err = ext4_journal_get_write_access(handle, dind);
@@ -839,7 +840,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (unlikely(err))
- goto exit_dind;
+ goto errout;
n_group_desc = ext4_kvmalloc((gdb_num + 1) *
sizeof(struct buffer_head *),
@@ -848,7 +849,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = -ENOMEM;
ext4_warning(sb, "not enough memory for %lu groups",
gdb_num + 1);
- goto exit_inode;
+ goto errout;
}
/*
@@ -864,7 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = ext4_handle_dirty_metadata(handle, NULL, dind);
if (unlikely(err)) {
ext4_std_error(sb, err);
- goto exit_inode;
+ goto errout;
}
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >>
(9 - EXT4_SB(sb)->s_cluster_bits);
@@ -873,7 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
if (unlikely(err)) {
ext4_std_error(sb, err);
- goto exit_inode;
+ goto errout;
}
brelse(dind);
@@ -889,15 +890,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = ext4_handle_dirty_super(handle, sb);
if (err)
ext4_std_error(sb, err);
-
return err;
-
-exit_inode:
+errout:
kvfree(n_group_desc);
brelse(iloc.bh);
-exit_dind:
brelse(dind);
-exit_bh:
brelse(gdb_bh);
ext4_debug("leaving with error %d\n", err);
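The exit_inode/exit_dind/exit_bh labels collapse into one errout label because every resource is now initialized to NULL (and iloc to { .bh = NULL }) up front, and brelse(NULL)/kvfree(NULL) are defined no-ops. The generic idiom (sketch; step1() and step2() are invented helpers):

static int do_thing(struct super_block *sb)
{
	struct buffer_head *a = NULL, *b = NULL;
	char *buf = NULL;
	int err;

	err = step1(sb, &a);
	if (err)
		goto errout;
	err = step2(sb, &b, &buf);
	if (err)
		goto errout;
	return 0;	/* on success the resources are handed off, not freed */
errout:
	kfree(buf);	/* kfree(NULL) is a no-op */
	brelse(a);	/* as is brelse(NULL) */
	brelse(b);
	return err;
}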
@@ -917,13 +914,14 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
gdblock = ext4_meta_bg_first_block_no(sb, group) +
ext4_bg_has_super(sb, group);
- gdb_bh = sb_bread(sb, gdblock);
- if (!gdb_bh)
- return -EIO;
+ gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+ if (IS_ERR(gdb_bh))
+ return PTR_ERR(gdb_bh);
n_group_desc = ext4_kvmalloc((gdb_num + 1) *
sizeof(struct buffer_head *),
GFP_NOFS);
if (!n_group_desc) {
+ brelse(gdb_bh);
err = -ENOMEM;
ext4_warning(sb, "not enough memory for %lu groups",
gdb_num + 1);
@@ -939,8 +937,6 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
kvfree(o_group_desc);
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
- if (unlikely(err))
- brelse(gdb_bh);
return err;
}
@@ -977,9 +973,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
return -ENOMEM;
data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
- dind = sb_bread(sb, le32_to_cpu(*data));
- if (!dind) {
- err = -EIO;
+ dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+ if (IS_ERR(dind)) {
+ err = PTR_ERR(dind);
+ dind = NULL;
goto exit_free;
}
@@ -998,9 +995,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
err = -EINVAL;
goto exit_bh;
}
- primary[res] = sb_bread(sb, blk);
- if (!primary[res]) {
- err = -EIO;
+ primary[res] = ext4_sb_bread(sb, blk, 0);
+ if (IS_ERR(primary[res])) {
+ err = PTR_ERR(primary[res]);
+ primary[res] = NULL;
goto exit_bh;
}
gdbackups = verify_reserved_gdb(sb, group, primary[res]);
@@ -1124,8 +1122,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
backup_block, backup_block -
ext4_group_first_block_no(sb, group));
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh)))
+ if ((err = ext4_journal_get_write_access(handle, bh))) {
+ brelse(bh);
break;
+ }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
@@ -1631,13 +1631,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
if (reserved_gdb || gdb_off == 0) {
- if (ext4_has_feature_resize_inode(sb) ||
+ if (!ext4_has_feature_resize_inode(sb) ||
!le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext4_warning(sb,
"No reserved GDT blocks, can't resize");
return -EPERM;
}
- inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode)) {
ext4_warning(sb, "Error opening resize inode");
return PTR_ERR(inode);
@@ -1965,7 +1965,8 @@ retry:
}
if (!resize_inode)
- resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO,
+ EXT4_IGET_SPECIAL);
if (IS_ERR(resize_inode)) {
ext4_warning(sb, "Error opening resize inode");
return PTR_ERR(resize_inode);
@@ -2023,7 +2024,7 @@ retry:
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
if (err)
- return err;
+ goto out;
err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
if (err)
@@ -2059,6 +2060,10 @@ retry:
n_blocks_count_retry = 0;
free_flex_gd(flex_gd);
flex_gd = NULL;
+ if (resize_inode) {
+ iput(resize_inode);
+ resize_inode = NULL;
+ }
goto retry;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a221f1cdf704..fb12d3c17c1b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -140,6 +140,29 @@ MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+/*
+ * This works like sb_bread() except it uses ERR_PTR for error
+ * returns. Currently with sb_bread it's impossible to distinguish
+ * between ENOMEM and EIO situations (since both result in a NULL
* return).
+ */
+struct buffer_head *
+ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+{
+ struct buffer_head *bh = sb_getblk(sb, block);
+
+ if (bh == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (buffer_uptodate(bh))
+ return bh;
+ ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return bh;
+ put_bh(bh);
+ return ERR_PTR(-EIO);
+}
+
static int ext4_verify_csum_type(struct super_block *sb,
struct ext4_super_block *es)
{
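ext4_sb_bread(), added above, returns ERR_PTR(-ENOMEM) or ERR_PTR(-EIO) where sb_bread() returned an ambiguous NULL, so callers can finally distinguish allocation failure from a media error. The conversion pattern repeated throughout the rest of this diff (sketch):

static int read_meta_block(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = ext4_sb_bread(sb, block, REQ_PRIO);

	if (IS_ERR(bh))
		return PTR_ERR(bh);	/* -ENOMEM and -EIO now distinct */
	/* ... consume bh->b_data ... */
	brelse(bh);
	return 0;
}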
@@ -1000,14 +1023,13 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
- if (sbi->s_ea_inode_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
- sbi->s_ea_inode_cache = NULL;
- }
- if (sbi->s_ea_block_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
- sbi->s_ea_block_cache = NULL;
- }
+
+ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+ sbi->s_ea_inode_cache = NULL;
+
+ ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
+
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
brelse(sbi->s_sbh);
@@ -1151,20 +1173,11 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
{
struct inode *inode;
- if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-ESTALE);
- if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
- return ERR_PTR(-ESTALE);
-
- /* iget isn't really right if the inode is currently unallocated!!
- *
- * ext4_read_inode will return a bad_inode if the inode had been
- * deleted, so we should be safe.
- *
+ /*
* Currently we don't know the generation for parent directory, so
* a generation of 0 means "accept any"
*/
- inode = ext4_iget_normal(sb, ino);
+ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
@@ -1189,6 +1202,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
ext4_nfs_get_inode);
}
+static int ext4_nfs_commit_metadata(struct inode *inode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL
+ };
+
+ trace_ext4_nfs_commit_metadata(inode);
+ return ext4_write_inode(inode, &wbc);
+}
+
/*
* Try to release metadata pages (indirect blocks, directories) which are
* mapped via the block device. Since these pages could have journal heads
@@ -1393,6 +1416,7 @@ static const struct export_operations ext4_export_ops = {
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
+ .commit_metadata = ext4_nfs_commit_metadata,
};
enum {
@@ -1939,7 +1963,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
#ifdef CONFIG_FS_DAX
ext4_msg(sb, KERN_WARNING,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
+ sbi->s_mount_opt |= m->mount_opt;
#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
@@ -3842,12 +3866,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
+ goto failed_mount;
}
if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
ext4_msg(sb, KERN_ERR,
- "DAX unsupported by block device. Turning off DAX.");
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
+ "DAX unsupported by block device.");
+ goto failed_mount;
}
}
@@ -4075,6 +4099,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_groups_count = blocks_count;
sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
+ le32_to_cpu(es->s_inodes_count),
+ ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
+ ret = -EINVAL;
+ goto failed_mount;
+ }
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
if (ext4_has_feature_meta_bg(sb)) {
@@ -4094,14 +4126,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto failed_mount;
}
- if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
- le32_to_cpu(es->s_inodes_count)) {
- ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
- le32_to_cpu(es->s_inodes_count),
- ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
- ret = -EINVAL;
- goto failed_mount;
- }
bgl_lock_init(sbi->s_blockgroup_lock);
@@ -4328,7 +4352,7 @@ no_journal:
* so we can safely mount the rest of the filesystem now.
*/
- root = ext4_iget(sb, EXT4_ROOT_INO);
+ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(root)) {
ext4_msg(sb, KERN_ERR, "get root inode failed");
ret = PTR_ERR(root);
@@ -4510,6 +4534,7 @@ failed_mount6:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
failed_mount5:
ext4_ext_release(sb);
ext4_release_system_zone(sb);
@@ -4521,14 +4546,12 @@ failed_mount4:
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
- if (sbi->s_ea_inode_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
- sbi->s_ea_inode_cache = NULL;
- }
- if (sbi->s_ea_block_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
- sbi->s_ea_block_cache = NULL;
- }
+ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+ sbi->s_ea_inode_cache = NULL;
+
+ ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
+
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -4597,7 +4620,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* happen if we iget() an unused inode, as the subsequent iput()
* will try to delete it.
*/
- journal_inode = ext4_iget(sb, journal_inum);
+ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
if (IS_ERR(journal_inode)) {
ext4_msg(sb, KERN_ERR, "no journal found");
return NULL;
@@ -4879,7 +4902,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
ext4_superblock_csum_set(sb);
if (sync)
lock_buffer(sbh);
- if (buffer_write_io_error(sbh)) {
+ if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
* superblock failed. This could happen because the
@@ -5679,7 +5702,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
if (!qf_inums[type])
return -EPERM;
- qf_inode = ext4_iget(sb, qf_inums[type]);
+ qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
return PTR_ERR(qf_inode);
@@ -5689,9 +5712,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
qf_inode->i_flags |= S_NOQUOTA;
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
err = dquot_enable(qf_inode, type, format_id, flags);
- iput(qf_inode);
if (err)
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
+ iput(qf_inode);
return err;
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f36fc5d5b257..86ed9c686249 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -384,7 +384,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
struct inode *inode;
int err;
- inode = ext4_iget(parent->i_sb, ea_ino);
+ inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(parent->i_sb,
@@ -522,14 +522,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
- error = -ENODATA;
if (!EXT4_I(inode)->i_file_acl)
- goto cleanup;
+ return -ENODATA;
ea_idebug(inode, "reading block %llu",
(unsigned long long)EXT4_I(inode)->i_file_acl);
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh)
- goto cleanup;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
error = ext4_xattr_check_block(inode, bh);
@@ -696,26 +695,23 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
- error = 0;
if (!EXT4_I(inode)->i_file_acl)
- goto cleanup;
+ return 0;
ea_idebug(inode, "reading block %llu",
(unsigned long long)EXT4_I(inode)->i_file_acl);
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- error = -EIO;
- if (!bh)
- goto cleanup;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
error = ext4_xattr_check_block(inode, bh);
if (error)
goto cleanup;
ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
- error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
-
+ error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer,
+ buffer_size);
cleanup:
brelse(bh);
-
return error;
}
@@ -830,9 +826,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
}
if (EXT4_I(inode)->i_file_acl) {
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh) {
- ret = -EIO;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
goto out;
}
@@ -1031,10 +1027,8 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
inode_lock(ea_inode);
ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
- if (ret) {
- iloc.bh = NULL;
+ if (ret)
goto out;
- }
ref_count = ext4_xattr_inode_get_ref(ea_inode);
ref_count += ref_change;
@@ -1080,12 +1074,10 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
}
ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
- iloc.bh = NULL;
if (ret)
ext4_warning_inode(ea_inode,
"ext4_mark_iloc_dirty() failed ret=%d", ret);
out:
- brelse(iloc.bh);
inode_unlock(ea_inode);
return ret;
}
@@ -1388,6 +1380,12 @@ retry:
bh = ext4_getblk(handle, ea_inode, block, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
+ if (!bh) {
+ WARN_ON_ONCE(1);
+ EXT4_ERROR_INODE(ea_inode,
+ "ext4_getblk() return bh = NULL");
+ return -EFSCORRUPTED;
+ }
ret = ext4_journal_get_write_access(handle, bh);
if (ret)
goto out;
@@ -1484,7 +1482,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
}
while (ce) {
- ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+ ea_inode = ext4_iget(inode->i_sb, ce->e_value,
+ EXT4_IGET_NORMAL);
if (!IS_ERR(ea_inode) &&
!is_bad_inode(ea_inode) &&
(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
@@ -1819,16 +1818,15 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
if (EXT4_I(inode)->i_file_acl) {
/* The inode already has an extended attribute block. */
- bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
- error = -EIO;
- if (!bs->bh)
- goto cleanup;
+ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bs->bh))
+ return PTR_ERR(bs->bh);
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
error = ext4_xattr_check_block(inode, bs->bh);
if (error)
- goto cleanup;
+ return error;
/* Find the named attribute. */
bs->s.base = BHDR(bs->bh);
bs->s.first = BFIRST(bs->bh);
@@ -1837,13 +1835,10 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
i->name_index, i->name, 1);
if (error && error != -ENODATA)
- goto cleanup;
+ return error;
bs->s.not_found = error;
}
- error = 0;
-
-cleanup:
- return error;
+ return 0;
}
static int
@@ -2272,12 +2267,14 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
if (!EXT4_I(inode)->i_file_acl)
return NULL;
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh)
- return ERR_PTR(-EIO);
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh))
+ return bh;
error = ext4_xattr_check_block(inode, bh);
- if (error)
+ if (error) {
+ brelse(bh);
return ERR_PTR(error);
+ }
return bh;
}
@@ -2397,6 +2394,8 @@ retry_inode:
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else if (error == -ENOSPC) {
if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
+ brelse(bs.bh);
+ bs.bh = NULL;
error = ext4_xattr_block_find(inode, &i, &bs);
if (error)
goto cleanup;
@@ -2617,6 +2616,8 @@ out:
kfree(buffer);
if (is)
brelse(is->iloc.bh);
+ if (bs)
+ brelse(bs->bh);
kfree(is);
kfree(bs);
@@ -2696,7 +2697,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle)
{
struct ext4_xattr_ibody_header *header;
- struct buffer_head *bh;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
static unsigned int mnt_count;
size_t min_offs;
@@ -2722,7 +2722,7 @@ retry:
base = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
min_offs = end - base;
- total_ino = sizeof(struct ext4_xattr_ibody_header);
+ total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
error = xattr_check_inode(inode, header, end);
if (error)
@@ -2737,13 +2737,18 @@ retry:
* EA block can hold new_extra_isize bytes.
*/
if (EXT4_I(inode)->i_file_acl) {
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- error = -EIO;
- if (!bh)
+ struct buffer_head *bh;
+
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ error = PTR_ERR(bh);
goto cleanup;
+ }
error = ext4_xattr_check_block(inode, bh);
- if (error)
+ if (error) {
+ brelse(bh);
goto cleanup;
+ }
base = BHDR(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
@@ -2892,11 +2897,12 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
}
if (EXT4_I(inode)->i_file_acl) {
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh) {
- EXT4_ERROR_INODE(inode, "block %llu read error",
- EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ error = PTR_ERR(bh);
+ if (error == -EIO)
+ EXT4_ERROR_INODE(inode, "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
error = ext4_xattr_check_block(inode, bh);
@@ -3049,8 +3055,10 @@ ext4_xattr_block_cache_find(struct inode *inode,
while (ce) {
struct buffer_head *bh;
- bh = sb_bread(inode->i_sb, ce->e_value);
- if (!bh) {
+ bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ if (PTR_ERR(bh) == -ENOMEM)
+ return NULL;
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long)ce->e_value);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index fa707cdd4120..63e599524085 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
return (void *)f2fs_acl;
fail:
- kfree(f2fs_acl);
+ kvfree(f2fs_acl);
return ERR_PTR(-EINVAL);
}
@@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
acl = NULL;
else
acl = ERR_PTR(retval);
- kfree(value);
+ kvfree(value);
return acl;
}
@@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
- kfree(value);
+ kvfree(value);
if (!error)
set_cached_acl(inode, type, acl);
@@ -352,12 +352,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
return PTR_ERR(p);
clone = f2fs_acl_clone(p, GFP_NOFS);
- if (!clone)
- goto no_mem;
+ if (!clone) {
+ ret = -ENOMEM;
+ goto release_acl;
+ }
ret = f2fs_acl_create_masq(clone, mode);
if (ret < 0)
- goto no_mem_clone;
+ goto release_clone;
if (ret == 0)
posix_acl_release(clone);
@@ -371,11 +373,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
return 0;
-no_mem_clone:
+release_clone:
posix_acl_release(clone);
-no_mem:
+release_acl:
posix_acl_release(p);
- return -ENOMEM;
+ return ret;
}
int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 9c28ea439e0b..f955cd3e0677 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -44,7 +44,7 @@ repeat:
cond_resched();
goto repeat;
}
- f2fs_wait_on_page_writeback(page, META, true);
+ f2fs_wait_on_page_writeback(page, META, true, true);
if (!PageUptodate(page))
SetPageUptodate(page);
return page;
@@ -370,9 +370,8 @@ continue_unlock:
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(page, META, true);
+ f2fs_wait_on_page_writeback(page, META, true, true);
- BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -911,7 +910,7 @@ free_fail_no_cp:
f2fs_put_page(cp1, 1);
f2fs_put_page(cp2, 1);
fail_no_cp:
- kfree(sbi->ckpt);
+ kvfree(sbi->ckpt);
return -EINVAL;
}
@@ -1290,11 +1289,11 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi,
struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
int err;
+ f2fs_wait_on_page_writeback(page, META, true, true);
+
memcpy(page_address(page), src, PAGE_SIZE);
- set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, META, true);
- f2fs_bug_on(sbi, PageWriteback(page));
+ set_page_dirty(page);
if (unlikely(!clear_page_dirty_for_io(page)))
f2fs_bug_on(sbi, 1);
@@ -1328,11 +1327,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
int err;
/* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META)) {
- f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- if (unlikely(f2fs_cp_error(sbi)))
- break;
- }
+ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
+ !f2fs_cp_error(sbi));
/*
* modify checkpoint
@@ -1405,14 +1402,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
for (i = 0; i < nm_i->nat_bits_blocks; i++)
f2fs_update_meta_page(sbi, nm_i->nat_bits +
(i << F2FS_BLKSIZE_BITS), blk + i);
-
- /* Flush all the NAT BITS pages */
- while (get_pages(sbi, F2FS_DIRTY_META)) {
- f2fs_sync_meta_pages(sbi, META, LONG_MAX,
- FS_CP_META_IO);
- if (unlikely(f2fs_cp_error(sbi)))
- break;
- }
}
/* write out checkpoint buffer at block 0 */
@@ -1448,6 +1437,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
+ !f2fs_cp_error(sbi));
/* wait for previous submitted meta pages writeback */
f2fs_wait_on_all_pages_writeback(sbi);
@@ -1465,7 +1456,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* invalidate intermediate page cache borrowed from meta inode
* which are used for migration of encrypted inode's blocks.
*/
- if (f2fs_sb_has_encrypt(sbi->sb))
+ if (f2fs_sb_has_encrypt(sbi))
invalidate_mapping_pages(META_MAPPING(sbi),
MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
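f2fs_wait_on_page_writeback() grows a fourth bool. Judging from the converted call sites and the dropped open-coded BUG_ON(PageWriteback(page)) checks, it tells the helper the page is locked so the writeback assertion can live in one place. An assumed shape, inferred from the call sites only (a sketch, not the actual f2fs implementation):

void f2fs_wait_on_page_writeback(struct page *page, enum page_type type,
				 bool ordered, bool locked)
{
	if (PageWriteback(page)) {
		struct f2fs_sb_info *sbi = F2FS_I_SB(page->mapping->host);

		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
		if (ordered) {
			wait_on_page_writeback(page);
			/* callers no longer open-code this assertion */
			f2fs_bug_on(sbi, locked && PageWriteback(page));
		} else {
			wait_for_stable_page(page);
		}
	}
}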
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 106f116466bf..da060b77f64d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
struct page *page;
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
page = bv->bv_page;
/* PG_error was set if any post_read step failed */
@@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
@@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
struct bio_vec *bvec;
struct page *target;
int i;
+ struct bvec_iter_all iter_all;
if (!io->bio)
return false;
@@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
if (!inode && !page && !ino)
return true;
- bio_for_each_segment_all(bvec, io->bio, i) {
+ bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
if (bvec->bv_page->mapping)
target = bvec->bv_page;
@@ -372,29 +375,6 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
return false;
}
-static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
- struct page *page, nid_t ino,
- enum page_type type)
-{
- enum page_type btype = PAGE_TYPE_OF_BIO(type);
- enum temp_type temp;
- struct f2fs_bio_info *io;
- bool ret = false;
-
- for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
- io = sbi->write_io[btype] + temp;
-
- down_read(&io->io_rwsem);
- ret = __has_merged_page(io, inode, page, ino);
- up_read(&io->io_rwsem);
-
- /* TODO: use HOT temp only for meta pages now. */
- if (ret || btype == META)
- break;
- }
- return ret;
-}
-
static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp)
{
@@ -420,13 +400,19 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
nid_t ino, enum page_type type, bool force)
{
enum temp_type temp;
-
- if (!force && !has_merged_page(sbi, inode, page, ino, type))
- return;
+ bool ret = true;
for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
+ if (!force) {
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- __f2fs_submit_merged_write(sbi, type, temp);
+ down_read(&io->io_rwsem);
+ ret = __has_merged_page(io, inode, page, ino);
+ up_read(&io->io_rwsem);
+ }
+ if (ret)
+ __f2fs_submit_merged_write(sbi, type, temp);
/* TODO: use HOT temp only for meta pages now. */
if (type >= META)
@@ -643,7 +629,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn)
*/
void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
{
- f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
__set_data_blkaddr(dn);
if (set_page_dirty(dn->node_page))
dn->node_changed = true;
@@ -673,7 +659,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
dn->ofs_in_node, count);
- f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
for (; count > 0; dn->ofs_in_node++) {
block_t blkaddr = datablock_addr(dn->inode,
@@ -957,6 +943,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
return err;
}
+ if (direct_io && allow_outplace_dio(inode, iocb, from))
+ return 0;
+
if (is_inode_flag_set(inode, FI_NO_PREALLOC))
return 0;
@@ -970,6 +959,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
map.m_next_pgofs = NULL;
map.m_next_extent = NULL;
map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = true;
if (direct_io) {
map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
@@ -1028,7 +1018,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int mode = create ? ALLOC_NODE : LOOKUP_NODE;
+ int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE;
pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
unsigned int ofs_in_node, last_ofs_in_node;
@@ -1048,6 +1038,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ map->m_may_create)
+ goto next_dnode;
+
map->m_pblk = ei.blk + pgofs - ei.fofs;
map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
map->m_flags = F2FS_MAP_MAPPED;
@@ -1062,7 +1056,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
}
next_dnode:
- if (create)
+ if (map->m_may_create)
__do_map_lock(sbi, flag, true);
/* When reading holes, we need its node page */
@@ -1099,11 +1093,13 @@ next_block:
if (is_valid_data_blkaddr(sbi, blkaddr)) {
/* use out-of-place update for direct IO under LFS mode */
- if (test_opt(sbi, LFS) && create &&
- flag == F2FS_GET_BLOCK_DIO) {
+ if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ map->m_may_create) {
err = __allocate_data_block(&dn, map->m_seg_type);
- if (!err)
+ if (!err) {
+ blkaddr = dn.data_blkaddr;
set_inode_flag(inode, FI_APPEND_WRITE);
+ }
}
} else {
if (create) {
@@ -1209,7 +1205,7 @@ skip:
f2fs_put_dnode(&dn);
- if (create) {
+ if (map->m_may_create) {
__do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
@@ -1235,7 +1231,7 @@ sync_out:
}
f2fs_put_dnode(&dn);
unlock_out:
- if (create) {
+ if (map->m_may_create) {
__do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
@@ -1257,6 +1253,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
map.m_next_pgofs = NULL;
map.m_next_extent = NULL;
map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
last_lblk = F2FS_BLK_ALIGN(pos + len);
while (map.m_lblk < last_lblk) {
@@ -1271,7 +1268,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create, int flag,
- pgoff_t *next_pgofs, int seg_type)
+ pgoff_t *next_pgofs, int seg_type, bool may_write)
{
struct f2fs_map_blocks map;
int err;
@@ -1281,6 +1278,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
map.m_next_pgofs = next_pgofs;
map.m_next_extent = NULL;
map.m_seg_type = seg_type;
+ map.m_may_create = may_write;
err = f2fs_map_blocks(inode, &map, create, flag);
if (!err) {
@@ -1297,16 +1295,25 @@ static int get_data_block(struct inode *inode, sector_t iblock,
{
return __get_data_block(inode, iblock, bh_result, create,
flag, next_pgofs,
- NO_CHECK_TYPE);
+ NO_CHECK_TYPE, create);
+}
+
+static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ return __get_data_block(inode, iblock, bh_result, create,
+ F2FS_GET_BLOCK_DIO, NULL,
+ f2fs_rw_hint_to_seg_type(inode->i_write_hint),
+ true);
}
static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO, NULL,
- f2fs_rw_hint_to_seg_type(
- inode->i_write_hint));
+ F2FS_GET_BLOCK_DIO, NULL,
+ f2fs_rw_hint_to_seg_type(inode->i_write_hint),
+ false);
}
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -1318,7 +1325,7 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock,
return __get_data_block(inode, iblock, bh_result, create,
F2FS_GET_BLOCK_BMAP, NULL,
- NO_CHECK_TYPE);
+ NO_CHECK_TYPE, create);
}
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -1525,6 +1532,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_next_pgofs = NULL;
map.m_next_extent = NULL;
map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
for (; nr_pages; nr_pages--) {
if (pages) {
@@ -1855,6 +1863,8 @@ got_it:
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio);
+ if (err && PageWriteback(page))
+ end_page_writeback(page);
trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
return err;
@@ -2071,7 +2081,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
int nwritten = 0;
pagevec_init(&pvec);
@@ -2142,12 +2152,11 @@ continue_unlock:
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
- DATA, true);
+ DATA, true, true);
else
goto continue_unlock;
}
- BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -2324,6 +2333,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
bool locked = false;
struct extent_info ei = {0,0,0};
int err = 0;
+ int flag;
/*
* we already allocated all the blocks, so we don't need to get
@@ -2333,9 +2343,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
!is_inode_flag_set(inode, FI_NO_PREALLOC))
return 0;
+ /* f2fs_lock_op avoids race between write CP and convert_inline_page */
+ if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode))
+ flag = F2FS_GET_BLOCK_DEFAULT;
+ else
+ flag = F2FS_GET_BLOCK_PRE_AIO;
+
if (f2fs_has_inline_data(inode) ||
(pos & PAGE_MASK) >= i_size_read(inode)) {
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ __do_map_lock(sbi, flag, true);
locked = true;
}
restart:
@@ -2373,6 +2389,7 @@ restart:
f2fs_put_dnode(&dn);
__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
true);
+ WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
locked = true;
goto restart;
}
@@ -2386,7 +2403,7 @@ out:
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ __do_map_lock(sbi, flag, false);
return err;
}
@@ -2457,7 +2474,7 @@ repeat:
}
}
- f2fs_wait_on_page_writeback(page, DATA, false);
+ f2fs_wait_on_page_writeback(page, DATA, false, true);
if (len == PAGE_SIZE || PageUptodate(page))
return 0;
@@ -2548,6 +2565,53 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
return 0;
}
+static void f2fs_dio_end_io(struct bio *bio)
+{
+ struct f2fs_private_dio *dio = bio->bi_private;
+
+ dec_page_count(F2FS_I_SB(dio->inode),
+ dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
+
+ bio->bi_private = dio->orig_private;
+ bio->bi_end_io = dio->orig_end_io;
+
+ kvfree(dio);
+
+ bio_endio(bio);
+}
+
+static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
+ loff_t file_offset)
+{
+ struct f2fs_private_dio *dio;
+ bool write = (bio_op(bio) == REQ_OP_WRITE);
+ int err;
+
+ dio = f2fs_kzalloc(F2FS_I_SB(inode),
+ sizeof(struct f2fs_private_dio), GFP_NOFS);
+ if (!dio) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dio->inode = inode;
+ dio->orig_end_io = bio->bi_end_io;
+ dio->orig_private = bio->bi_private;
+ dio->write = write;
+
+ bio->bi_end_io = f2fs_dio_end_io;
+ bio->bi_private = dio;
+
+ inc_page_count(F2FS_I_SB(inode),
+ write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
+
+ submit_bio(bio);
+ return;
+out:
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+}
+
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
@@ -2594,7 +2658,10 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
down_read(&fi->i_gc_rwsem[READ]);
}
- err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
+ err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, rw == WRITE ? get_data_block_dio_write :
+ get_data_block_dio, NULL, f2fs_dio_submit_bio,
+ DIO_LOCKING | DIO_SKIP_HOLES);
if (do_opu)
up_read(&fi->i_gc_rwsem[READ]);
@@ -2738,7 +2805,7 @@ int f2fs_migrate_page(struct address_space *mapping,
*/
extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
rc = migrate_page_move_mapping(mapping, newpage,
- page, NULL, mode, extra_count);
+ page, mode, extra_count);
if (rc != MIGRATEPAGE_SUCCESS) {
if (atomic_written)
mutex_unlock(&fi->inmem_lock);
@@ -2787,13 +2854,13 @@ const struct address_space_operations f2fs_dblock_aops = {
#endif
};
-void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
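
The f2fs_private_dio hunk above interposes on the bio completion path so that in-flight direct-IO pages can be counted (and later consulted by is_idle()). A minimal userspace model of that interposition pattern, assuming stand-in types — request, dio_wrapper and in_flight here are illustrative, not kernel API:

#include <stdio.h>
#include <stdlib.h>

struct request {
	void (*end_io)(struct request *);
	void *private;
};

static int in_flight;	/* models the F2FS_DIO_READ/WRITE counters */

struct dio_wrapper {
	void (*orig_end_io)(struct request *);
	void *orig_private;
};

static void dio_end_io(struct request *req)
{
	struct dio_wrapper *w = req->private;

	in_flight--;			/* models dec_page_count() */
	req->private = w->orig_private;	/* restore the original context */
	req->end_io = w->orig_end_io;
	free(w);
	req->end_io(req);		/* chain to the original completion */
}

static int dio_submit(struct request *req)
{
	struct dio_wrapper *w = malloc(sizeof(*w));

	if (!w)
		return -1;	/* caller fails the request, as in the patch */
	w->orig_end_io = req->end_io;
	w->orig_private = req->private;
	req->end_io = dio_end_io;
	req->private = w;
	in_flight++;			/* models inc_page_count() */
	return 0;			/* real code would submit_bio() here */
}

static void orig_done(struct request *req)
{
	(void)req;
	printf("done, still in flight: %d\n", in_flight);
}

int main(void)
{
	struct request req = { .end_io = orig_done, .private = NULL };

	if (dio_submit(&req) == 0)
		req.end_io(&req);	/* simulate the device completing */
	return 0;
}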
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 139b4d5c83d5..fd7f170e2f2d 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -53,6 +53,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
+ si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
+ si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA);
@@ -62,7 +64,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->nr_flushed =
atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
si->nr_flushing =
- atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush);
si->flush_list_empty =
llist_empty(&SM_I(sbi)->fcc_info->issue_list);
}
@@ -70,7 +72,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->nr_discarded =
atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
si->nr_discarding =
- atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard);
si->nr_discard_cmd =
atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
@@ -197,7 +199,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
si->base_mem += SIT_VBLOCK_MAP_SIZE;
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -374,6 +376,8 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
+ seq_printf(s, " - DIO (R: %4d, W: %4d)\n",
+ si->nr_dio_read, si->nr_dio_write);
seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n",
si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
@@ -444,18 +448,7 @@ static int stat_show(struct seq_file *s, void *v)
return 0;
}
-static int stat_open(struct inode *inode, struct file *file)
-{
- return single_open(file, stat_show, inode->i_private);
-}
-
-static const struct file_operations stat_fops = {
- .owner = THIS_MODULE,
- .open = stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(stat);
int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
@@ -510,33 +503,19 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
list_del(&si->stat_list);
mutex_unlock(&f2fs_stat_mutex);
- kfree(si);
+ kvfree(si);
}
-int __init f2fs_create_root_stats(void)
+void __init f2fs_create_root_stats(void)
{
- struct dentry *file;
-
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
- if (!f2fs_debugfs_root)
- return -ENOMEM;
- file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
- NULL, &stat_fops);
- if (!file) {
- debugfs_remove(f2fs_debugfs_root);
- f2fs_debugfs_root = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
+ &stat_fops);
}
void f2fs_destroy_root_stats(void)
{
- if (!f2fs_debugfs_root)
- return;
-
debugfs_remove_recursive(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
}
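
DEFINE_SHOW_ATTRIBUTE(stat) replaces the open-coded stat_open()/stat_fops boilerplate removed above. A simplified userspace sketch of what a macro like that factors out — seq_file and file_ops here are toy stand-ins for the real <linux/seq_file.h> machinery:

#include <stdio.h>

struct seq_file {
	FILE *out;
};

struct file_ops {
	int (*open)(struct seq_file *, void *);
};

/* generate name##_open() and name##_fops from name##_show() */
#define DEFINE_SHOW_ATTRIBUTE(name)				\
static int name##_open(struct seq_file *m, void *data)		\
{								\
	return name##_show(m, data);				\
}								\
static const struct file_ops name##_fops = {			\
	.open = name##_open,					\
}

static int stat_show(struct seq_file *m, void *v)
{
	(void)v;
	fprintf(m->out, "status: ok\n");
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(stat);

int main(void)
{
	struct seq_file m = { .out = stdout };

	return stat_fops.open(&m, NULL);
}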
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2ef84b4590ea..50d0d36280fa 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -293,7 +293,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
- f2fs_wait_on_page_writeback(page, type, true);
+ f2fs_wait_on_page_writeback(page, type, true, true);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode->i_mode);
set_page_dirty(page);
@@ -307,7 +307,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
@@ -550,7 +550,7 @@ start:
++level;
goto start;
add_dentry:
- f2fs_wait_on_page_writeback(dentry_page, DATA, true);
+ f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
@@ -705,7 +705,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
return f2fs_delete_inline_entry(dentry, page, dir, inode);
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
dentry_blk = page_address(page);
bit_pos = dentry - dentry_blk->dentry;
@@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
@@ -808,6 +808,17 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
+ /* check memory boundary before moving forward */
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ if (unlikely(bit_pos > d->max)) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: corrupted namelen=%d, run fsck to fix.",
+ __func__, le16_to_cpu(de->name_len));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ err = -EINVAL;
+ goto out;
+ }
+
if (f2fs_encrypted_inode(d->inode)) {
int save_len = fstr->len;
@@ -830,7 +841,6 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (readdir_ra)
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
- bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
ctx->pos = start_pos + bit_pos;
}
out:
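
The new check in f2fs_fill_dentries() advances bit_pos from the on-disk name_len before the name is used, and rejects the entry if it would walk past the dentry bitmap, so a corrupted length can no longer read beyond the block. A small standalone model of that guard, with illustrative constants:

#include <stdio.h>

#define SLOT_LEN		8	/* bytes of name per dentry slot */
#define GET_DENTRY_SLOTS(len)	(((len) + SLOT_LEN - 1) / SLOT_LEN)

static int fill_dentry(unsigned int bit_pos, unsigned int name_len,
		       unsigned int max)
{
	/* check memory boundary before moving forward */
	bit_pos += GET_DENTRY_SLOTS(name_len);
	if (bit_pos > max) {
		fprintf(stderr, "corrupted namelen=%u, run fsck to fix\n",
			name_len);
		return -1;	/* models -EINVAL plus SBI_NEED_FSCK */
	}
	printf("emit dentry, next slot %u\n", bit_pos);
	return 0;
}

int main(void)
{
	fill_dentry(0, 12, 214);	/* sane name fits the block */
	fill_dentry(210, 4000, 214);	/* corrupted length is rejected */
	return 0;
}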
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 56204a8f8a12..8f23ee6e8eb9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -67,7 +67,7 @@ struct f2fs_fault_info {
unsigned int inject_type;
};
-extern char *f2fs_fault_name[FAULT_MAX];
+extern const char *f2fs_fault_name[FAULT_MAX];
#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
#endif
@@ -152,12 +152,13 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_VERITY 0x0400 /* reserved */
#define F2FS_FEATURE_SB_CHKSUM 0x0800
-#define F2FS_HAS_FEATURE(sb, mask) \
- ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
-#define F2FS_SET_FEATURE(sb, mask) \
- (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask))
-#define F2FS_CLEAR_FEATURE(sb, mask) \
- (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask))
+#define __F2FS_HAS_FEATURE(raw_super, mask) \
+ ((raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask)
+#define F2FS_SET_FEATURE(sbi, mask) \
+ (sbi->raw_super->feature |= cpu_to_le32(mask))
+#define F2FS_CLEAR_FEATURE(sbi, mask) \
+ (sbi->raw_super->feature &= ~cpu_to_le32(mask))
/*
* Default values for user and/or group using reserved blocks
@@ -284,7 +285,7 @@ struct discard_cmd {
struct block_device *bdev; /* bdev */
unsigned short ref; /* reference count */
unsigned char state; /* state */
- unsigned char issuing; /* issuing discard */
+ unsigned char queued; /* queued discard */
int error; /* bio error */
spinlock_t lock; /* for state/bio_ref updating */
unsigned short bio_ref; /* bio reference count */
@@ -326,7 +327,7 @@ struct discard_cmd_control {
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
atomic_t issued_discard; /* # of issued discard */
- atomic_t issing_discard; /* # of issing discard */
+ atomic_t queued_discard; /* # of queued discard */
atomic_t discard_cmd_cnt; /* # of cached cmd count */
struct rb_root_cached root; /* root of discard rb-tree */
bool rbtree_check; /* config for consistency check */
@@ -416,6 +417,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
+#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -557,16 +559,8 @@ struct extent_info {
};
struct extent_node {
- struct rb_node rb_node;
- union {
- struct {
- unsigned int fofs;
- unsigned int len;
- u32 blk;
- };
- struct extent_info ei; /* extent info */
-
- };
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ struct extent_info ei; /* extent info */
struct list_head list; /* node in global extent list of sbi */
struct extent_tree *et; /* extent tree pointer */
};
@@ -601,6 +595,7 @@ struct f2fs_map_blocks {
pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */
pgoff_t *m_next_extent; /* point to next possible extent */
int m_seg_type;
+ bool m_may_create; /* indicate it is from write path */
};
/* for flag in get_data_block */
@@ -889,7 +884,7 @@ struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
atomic_t issued_flush; /* # of issued flushes */
- atomic_t issing_flush; /* # of issing flushes */
+ atomic_t queued_flush; /* # of queued flushes */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -956,6 +951,8 @@ enum count_type {
F2FS_RD_DATA,
F2FS_RD_NODE,
F2FS_RD_META,
+ F2FS_DIO_WRITE,
+ F2FS_DIO_READ,
NR_COUNT_TYPE,
};
@@ -1170,8 +1167,6 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
- struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE];
- /* bio ordering for NODE/DATA */
/* keep migration IO order for LFS mode */
struct rw_semaphore io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
@@ -1263,6 +1258,7 @@ struct f2fs_sb_info {
struct f2fs_gc_kthread *gc_thread; /* GC thread */
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
+ unsigned int next_victim_seg[2]; /* next segment in victim section */
/* for skip statistic */
unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
@@ -1272,6 +1268,8 @@ struct f2fs_sb_info {
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
+ /* migration granularity of garbage collection, unit: segment */
+ unsigned int migration_granularity;
/*
* for stat information.
@@ -1330,6 +1328,13 @@ struct f2fs_sb_info {
__u32 s_chksum_seed;
};
+struct f2fs_private_dio {
+ struct inode *inode;
+ void *orig_private;
+ bio_end_io_t *orig_end_io;
+ bool write;
+};
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
#define f2fs_show_injection_info(type) \
printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \
@@ -1608,12 +1613,16 @@ static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
{
unsigned long flags;
- set_sbi_flag(sbi, SBI_NEED_FSCK);
+ /*
+ * In order to re-enable nat_bits we would have to trigger fsck.f2fs
+ * via set_sbi_flag(sbi, SBI_NEED_FSCK). But that may incur a huge
+ * cost, so let's rely on a regular fsck or an unclean shutdown.
+ */
if (lock)
spin_lock_irqsave(&sbi->cp_lock, flags);
__clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
- kfree(NM_I(sbi)->nat_bits);
+ kvfree(NM_I(sbi)->nat_bits);
NM_I(sbi)->nat_bits = NULL;
if (lock)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
@@ -2146,7 +2155,11 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
{
if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
- get_pages(sbi, F2FS_WB_CP_DATA))
+ get_pages(sbi, F2FS_WB_CP_DATA) ||
+ get_pages(sbi, F2FS_DIO_READ) ||
+ get_pages(sbi, F2FS_DIO_WRITE) ||
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard) ||
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
return false;
return f2fs_time_over(sbi, type);
}
@@ -2370,6 +2383,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_NEW_INODE:
if (set)
return;
+ /* fall through */
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
@@ -2672,22 +2686,37 @@ static inline bool is_dot_dotdot(const struct qstr *str)
static inline bool f2fs_may_extent_tree(struct inode *inode)
{
- if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, EXTENT_CACHE) ||
is_inode_flag_set(inode, FI_NO_EXTENT))
return false;
+ /*
+ * For files recovered during mount, do not create extents
+ * if the shrinker is not registered yet.
+ */
+ if (list_empty(&sbi->s_list))
+ return false;
+
return S_ISREG(inode->i_mode);
}
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
+ void *ret;
+
if (time_to_inject(sbi, FAULT_KMALLOC)) {
f2fs_show_injection_info(FAULT_KMALLOC);
return NULL;
}
- return kmalloc(size, flags);
+ ret = kmalloc(size, flags);
+ if (ret)
+ return ret;
+
+ return kvmalloc(size, flags);
}
static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi,
@@ -2762,6 +2791,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
spin_unlock(&sbi->iostat_lock);
}
+#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
+
#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \
(!is_read_io(fio->op) || fio->is_meta))
@@ -3007,7 +3038,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio, bool add_list);
void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered);
+ enum page_type type, bool ordered, bool locked);
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
block_t len);
@@ -3108,7 +3139,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
-void f2fs_clear_radix_tree_dirty_tag(struct page *page);
+void f2fs_clear_page_cache_dirty_tag(struct page *page);
/*
* gc.c
@@ -3147,6 +3178,7 @@ struct f2fs_stat_info {
int total_count, utilization;
int bg_gc, nr_wb_cp_data, nr_wb_data;
int nr_rd_data, nr_rd_node, nr_rd_meta;
+ int nr_dio_read, nr_dio_write;
unsigned int io_skip_bggc, other_skip_bggc;
int nr_flushing, nr_flushed, flush_list_empty;
int nr_discarding, nr_discarded;
@@ -3296,7 +3328,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *sbi);
void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
-int __init f2fs_create_root_stats(void);
+void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si) do { } while (0)
@@ -3334,7 +3366,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline int __init f2fs_create_root_stats(void) { return 0; }
+static inline void __init f2fs_create_root_stats(void) { }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -3459,9 +3491,9 @@ static inline bool f2fs_post_read_required(struct inode *inode)
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
-static inline int f2fs_sb_has_##name(struct super_block *sb) \
+static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
{ \
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \
+ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \
}
F2FS_FEATURE_FUNCS(encrypt, ENCRYPT);
@@ -3491,7 +3523,7 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi,
static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
{
- return f2fs_sb_has_blkzoned(sbi->sb);
+ return f2fs_sb_has_blkzoned(sbi);
}
static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
@@ -3566,7 +3598,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
* for a blkzoned device, fall back from direct IO to buffered IO, so
* all IOs can be serialized by the log-structured write path.
*/
- if (f2fs_sb_has_blkzoned(sbi->sb))
+ if (f2fs_sb_has_blkzoned(sbi))
return true;
if (test_opt(sbi, LFS) && (rw == WRITE) &&
block_unaligned_IO(inode, iocb, iter))
@@ -3589,7 +3621,7 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
{
#ifdef CONFIG_QUOTA
- if (f2fs_sb_has_quota_ino(sbi->sb))
+ if (f2fs_sb_has_quota_ino(sbi))
return true;
if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
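
Many of this patch's kfree()-to-kvfree() conversions follow from the f2fs_kmalloc() change above: the buffer may now come from kvmalloc() when kmalloc() fails, and only kvfree() can release memory from either source. A userspace model of that pairing, assuming stand-in allocators tagged with their origin:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum source { SRC_SLAB, SRC_VMALLOC };	/* stand-ins for the two pools */

struct header {
	enum source src;
};

static void *alloc_from(enum source src, size_t size)
{
	struct header *h = malloc(sizeof(*h) + size);

	if (!h)
		return NULL;
	h->src = src;
	return h + 1;
}

/* models f2fs_kmalloc(): try the slab path, fall back to vmalloc */
static void *f2fs_kmalloc_model(size_t size, bool slab_fails)
{
	void *p = slab_fails ? NULL : alloc_from(SRC_SLAB, size);

	return p ? p : alloc_from(SRC_VMALLOC, size);
}

/* models kvfree(): one release path valid for both sources */
static void kvfree_model(void *p)
{
	struct header *h = (struct header *)p - 1;

	printf("freeing %s memory\n",
	       h->src == SRC_SLAB ? "slab" : "vmalloc");
	free(h);
}

int main(void)
{
	void *a = f2fs_kmalloc_model(64, false);
	void *b = f2fs_kmalloc_model(64, true);	/* forces the fallback */

	if (!a || !b)
		return 1;
	kvfree_model(a);
	kvfree_model(b);
	return 0;
}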
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 88b124677189..bba56b39dcc5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -82,7 +82,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
}
/* fill the page */
- f2fs_wait_on_page_writeback(page, DATA, false);
+ f2fs_wait_on_page_writeback(page, DATA, false, true);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
@@ -216,6 +216,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
trace_f2fs_sync_file_enter(inode);
+ if (S_ISDIR(inode->i_mode))
+ goto go_write;
+
/* if fdatasync is triggered, let's do in-place-update */
if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(inode, FI_NEED_IPU);
@@ -575,7 +578,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
if (IS_ERR(page))
return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
truncate_out:
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
zero_user(page, offset, PAGE_SIZE - offset);
/* An encrypted inode should have a key and truncate the last page. */
@@ -696,7 +699,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
unsigned int flags;
if (f2fs_has_extra_attr(inode) &&
- f2fs_sb_has_inode_crtime(inode->i_sb) &&
+ f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = fi->i_crtime.tv_sec;
@@ -892,7 +895,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
if (IS_ERR(page))
return PTR_ERR(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
zero_user(page, start, len);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -1496,7 +1499,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
- .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE };
+ .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
+ .m_may_create = true };
pgoff_t pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
@@ -1681,7 +1685,7 @@ static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags)
inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
- f2fs_mark_inode_dirty_sync(inode, false);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -1962,6 +1966,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
f2fs_stop_checkpoint(sbi, false);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
+ case F2FS_GOING_DOWN_NEED_FSCK:
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ /* do checkpoint only */
+ ret = f2fs_sync_fs(sb, 1);
+ if (ret)
+ goto out;
+ break;
default:
ret = -EINVAL;
goto out;
@@ -2030,7 +2041,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- if (!f2fs_sb_has_encrypt(inode->i_sb))
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode)))
return -EOPNOTSUPP;
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -2040,7 +2051,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
- if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb))
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
}
@@ -2051,7 +2062,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
- if (!f2fs_sb_has_encrypt(inode->i_sb))
+ if (!f2fs_sb_has_encrypt(sbi))
return -EOPNOTSUPP;
err = mnt_want_write_file(filp);
@@ -2155,7 +2166,7 @@ do_more:
}
ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
- range.start += sbi->blocks_per_seg;
+ range.start += BLKS_PER_SEC(sbi);
if (range.start <= end)
goto do_more;
out:
@@ -2197,7 +2208,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
{
struct inode *inode = file_inode(filp);
struct f2fs_map_blocks map = { .m_next_extent = NULL,
- .m_seg_type = NO_CHECK_TYPE };
+ .m_seg_type = NO_CHECK_TYPE,
+ .m_may_create = false };
struct extent_info ei = {0, 0, 0};
pgoff_t pg_start, pg_end, next_pgofs;
unsigned int blk_per_seg = sbi->blocks_per_seg;
@@ -2560,7 +2572,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
return -EFAULT;
if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
- sbi->segs_per_sec != 1) {
+ __is_large_section(sbi)) {
f2fs_msg(sbi->sb, KERN_WARNING,
"Can't flush %u in %d for segs_per_sec %u != 1\n",
range.dev_num, sbi->s_ndevs,
@@ -2635,12 +2647,11 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct super_block *sb = sbi->sb;
struct page *ipage;
kprojid_t kprojid;
int err;
- if (!f2fs_sb_has_project_quota(sb)) {
+ if (!f2fs_sb_has_project_quota(sbi)) {
if (projid != F2FS_DEF_PROJID)
return -EOPNOTSUPP;
else
@@ -2757,7 +2768,7 @@ static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)
fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags &
F2FS_FL_USER_VISIBLE);
- if (f2fs_sb_has_project_quota(inode->i_sb))
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)))
fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
fi->i_projid);
@@ -2932,6 +2943,7 @@ int f2fs_precache_extents(struct inode *inode)
map.m_next_pgofs = NULL;
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
end = F2FS_I_SB(inode)->max_file_blocks;
while (map.m_lblk < end) {
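
Callers of f2fs_map_blocks() now state through m_may_create whether the mapping may allocate new blocks, instead of overloading the buffer_head create flag; read-side users (readpages, bmap, precache, overwrite checks) pass false, while the DIO write path passes true. A minimal sketch of that contract, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

enum node_mode { LOOKUP_NODE, ALLOC_NODE };

struct map_blocks {
	unsigned long m_lblk;	/* logical block to map */
	bool m_may_create;	/* true only on a write path */
};

static int map_blocks(const struct map_blocks *map)
{
	enum node_mode mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE;

	printf("block %lu: %s\n", map->m_lblk,
	       mode == ALLOC_NODE ? "allocate if hole" : "lookup only");
	return 0;
}

int main(void)
{
	struct map_blocks rd = { .m_lblk = 10, .m_may_create = false };
	struct map_blocks wr = { .m_lblk = 10, .m_may_create = true };

	map_blocks(&rd);	/* read side, e.g. get_data_block_dio */
	map_blocks(&wr);	/* e.g. get_data_block_dio_write */
	return 0;
}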
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a07241fb8537..195cf0f9d9ef 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -142,7 +142,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
err = PTR_ERR(gc_th->f2fs_gc_task);
- kfree(gc_th);
+ kvfree(gc_th);
sbi->gc_thread = NULL;
}
out:
@@ -155,7 +155,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
if (!gc_th)
return;
kthread_stop(gc_th->f2fs_gc_task);
- kfree(gc_th);
+ kvfree(gc_th);
sbi->gc_thread = NULL;
}
@@ -323,8 +323,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = get_max_cost(sbi, &p);
if (*result != NULL_SEGNO) {
- if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
- get_valid_blocks(sbi, *result, false) &&
+ if (get_valid_blocks(sbi, *result, false) &&
!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
p.min_segno = *result;
goto out;
@@ -333,6 +332,22 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
if (p.max_search == 0)
goto out;
+ if (__is_large_section(sbi) && p.alloc_mode == LFS) {
+ if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
+ p.min_segno = sbi->next_victim_seg[BG_GC];
+ *result = p.min_segno;
+ sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+ goto got_result;
+ }
+ if (gc_type == FG_GC &&
+ sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
+ p.min_segno = sbi->next_victim_seg[FG_GC];
+ *result = p.min_segno;
+ sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
+ goto got_result;
+ }
+ }
+
last_victim = sm->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
@@ -395,6 +410,8 @@ next:
}
if (p.min_segno != NULL_SEGNO) {
got_it:
+ *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+got_result:
if (p.alloc_mode == LFS) {
secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
if (gc_type == FG_GC)
@@ -402,13 +419,13 @@ got_it:
else
set_bit(secno, dirty_i->victim_secmap);
}
- *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+ }
+out:
+ if (p.min_segno != NULL_SEGNO)
trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
- }
-out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -658,6 +675,14 @@ got_it:
fio.page = page;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
+ /*
+ * don't cache encrypted data into meta inode until previous dirty
+ * data has been written back, to avoid racing between GC and flush.
+ */
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
+
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
dn.data_blkaddr,
FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -743,7 +768,9 @@ static int move_data_block(struct inode *inode, block_t bidx,
* don't cache encrypted data into meta inode until previous dirty
* data has been written back, to avoid racing between GC and flush.
*/
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
if (err)
@@ -802,8 +829,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
}
write_page:
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
@@ -811,7 +838,7 @@ write_page:
ClearPageError(page);
/* allocate block address */
- f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
fio.op = REQ_OP_WRITE;
fio.op_flags = REQ_SYNC;
@@ -897,8 +924,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
bool is_dirty = PageDirty(page);
retry:
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
@@ -1093,15 +1121,18 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + sbi->segs_per_sec;
- int seg_freed = 0;
+ int seg_freed = 0, migrated = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
int submitted = 0;
+ if (__is_large_section(sbi))
+ end_segno = rounddown(end_segno, sbi->segs_per_sec);
+
/* readahead multi ssa blocks those have contiguous address */
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
- sbi->segs_per_sec, META_SSA, true);
+ end_segno - segno, META_SSA, true);
/* reference all summary page */
while (segno < end_segno) {
@@ -1130,10 +1161,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
GET_SUM_BLOCK(sbi, segno));
f2fs_put_page(sum_page, 0);
- if (get_valid_blocks(sbi, segno, false) == 0 ||
- !PageUptodate(sum_page) ||
- unlikely(f2fs_cp_error(sbi)))
- goto next;
+ if (get_valid_blocks(sbi, segno, false) == 0)
+ goto freed;
+ if (__is_large_section(sbi) &&
+ migrated >= sbi->migration_granularity)
+ goto skip;
+ if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
+ goto skip;
sum = page_address(sum_page);
if (type != GET_SUM_TYPE((&sum->footer))) {
@@ -1141,7 +1175,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
"type [%d, %d] in SSA and SIT",
segno, type, GET_SUM_TYPE((&sum->footer)));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- goto next;
+ goto skip;
}
/*
@@ -1160,10 +1194,15 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
stat_inc_seg_count(sbi, type, gc_type);
+freed:
if (gc_type == FG_GC &&
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
-next:
+ migrated++;
+
+ if (__is_large_section(sbi) && segno + 1 < end_segno)
+ sbi->next_victim_seg[gc_type] = segno + 1;
+skip:
f2fs_put_page(sum_page, 0);
}
@@ -1307,7 +1346,7 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
/* give warm/cold data area from slower device */
- if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+ if (sbi->s_ndevs && !__is_large_section(sbi))
SIT_I(sbi)->last_victim[ALLOC_NEXT] =
GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}
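
The gc.c changes cap how many segments of a large section one GC round migrates (migration_granularity) and record next_victim_seg so the next round resumes inside the same section instead of re-running victim selection. A simplified standalone model of that resume logic — do_gc_round() is a toy stand-in for do_garbage_collect():

#include <stdio.h>

#define NULL_SEGNO	(~0u)

static unsigned int next_victim_seg = NULL_SEGNO;

static void do_gc_round(unsigned int start, unsigned int nsegs,
			unsigned int granularity)
{
	unsigned int segno, migrated = 0;

	for (segno = start; segno < start + nsegs; segno++) {
		if (migrated >= granularity)
			break;
		printf("migrate segment %u\n", segno);
		migrated++;
	}
	/* remember the resume point if the section was not finished */
	next_victim_seg = (segno < start + nsegs) ? segno : NULL_SEGNO;
}

int main(void)
{
	unsigned int start = 0, segs_per_sec = 8, granularity = 3;

	do_gc_round(start, segs_per_sec, granularity);
	while (next_victim_seg != NULL_SEGNO)
		do_gc_round(next_victim_seg,
			    segs_per_sec - (next_victim_seg - start),
			    granularity);
	return 0;
}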
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index cb31a719b048..d636cbcf68f2 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -72,7 +72,7 @@ void f2fs_truncate_inline_inode(struct inode *inode,
addr = inline_data_addr(inode, ipage);
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
memset(addr + from, 0, MAX_INLINE_DATA(inode) - from);
set_page_dirty(ipage);
@@ -161,7 +161,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
fio.old_blkaddr = dn->data_blkaddr;
set_inode_flag(dn->inode, FI_HOT_DATA);
f2fs_outplace_write_data(dn, &fio);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
if (dirty) {
inode_dec_dirty_pages(dn->inode);
f2fs_remove_dirty_inode(dn->inode);
@@ -236,14 +236,14 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
- f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
src_addr = kmap_atomic(page);
dst_addr = inline_data_addr(inode, dn.inode_page);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
@@ -277,7 +277,7 @@ process_inline:
ipage = f2fs_get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
src_addr = inline_data_addr(inode, npage);
dst_addr = inline_data_addr(inode, ipage);
@@ -391,7 +391,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
goto out;
}
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
dentry_blk = page_address(page);
@@ -501,18 +501,18 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
- kfree(backup_dentry);
+ kvfree(backup_dentry);
return 0;
recover:
lock_page(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir));
f2fs_i_depth_write(dir, 0);
f2fs_i_size_write(dir, MAX_INLINE_DATA(dir));
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
- kfree(backup_dentry);
+ kvfree(backup_dentry);
return err;
}
@@ -565,7 +565,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
}
}
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
name_hash = f2fs_dentry_hash(new_name, NULL);
f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
@@ -597,7 +597,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
int i;
lock_page(page);
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
inline_dentry = inline_data_addr(dir, page);
make_dentry_ptr_inline(dir, &d, inline_dentry);
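
The pervasive f2fs_wait_on_page_writeback(..., true, true) conversions in this file (and throughout the patch) reflect the helper's new fourth argument: callers that hold the page lock say so, letting the wait assert writeback has truly finished and replacing the open-coded BUG_ON(PageWriteback(page)) calls removed elsewhere in the series. A userspace sketch of that semantic:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	bool writeback;
};

static void wait_on_writeback(struct page *p)
{
	p->writeback = false;	/* stand-in for the real blocking wait */
}

static void wait_on_page_writeback(struct page *p, bool ordered, bool locked)
{
	if (!p->writeback)
		return;
	if (ordered)
		wait_on_writeback(p);
	/*
	 * With the page locked, no new writeback can start behind us,
	 * so an ordered wait must leave the flag clear.
	 */
	if (ordered && locked)
		assert(!p->writeback);
}

int main(void)
{
	struct page p = { .writeback = true };

	wait_on_page_writeback(&p, true, true);
	printf("writeback pending: %d\n", p.writeback);
	return 0;
}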
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ceee0ed4c4..bec52961630b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -103,7 +103,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
while (start < end) {
if (*start++) {
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
set_inode_flag(inode, FI_DATA_EXIST);
set_raw_inline(inode, F2FS_INODE(ipage));
@@ -118,7 +118,7 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page
{
struct f2fs_inode *ri = &F2FS_NODE(page)->i;
- if (!f2fs_sb_has_inode_chksum(sbi->sb))
+ if (!f2fs_sb_has_inode_chksum(sbi))
return false;
if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR))
@@ -218,7 +218,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)
&& !f2fs_has_extra_attr(inode)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_msg(sbi->sb, KERN_WARNING,
@@ -228,7 +228,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
if (f2fs_has_extra_attr(inode) &&
- !f2fs_sb_has_extra_attr(sbi->sb)) {
+ !f2fs_sb_has_extra_attr(sbi)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_msg(sbi->sb, KERN_WARNING,
"%s: inode (ino=%lx) is with extra_attr, "
@@ -340,7 +340,7 @@ static int do_read_inode(struct inode *inode)
fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
le16_to_cpu(ri->i_extra_isize) : 0;
- if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size);
} else if (f2fs_has_inline_xattr(inode) ||
f2fs_has_inline_dentry(inode)) {
@@ -390,14 +390,14 @@ static int do_read_inode(struct inode *inode)
if (fi->i_flags & F2FS_PROJINHERIT_FL)
set_inode_flag(inode, FI_PROJ_INHERIT);
- if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) &&
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
i_projid = (projid_t)le32_to_cpu(ri->i_projid);
else
i_projid = F2FS_DEF_PROJID;
fi->i_projid = make_kprojid(&init_user_ns, i_projid);
- if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) &&
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime);
fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
@@ -497,7 +497,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
struct f2fs_inode *ri;
struct extent_tree *et = F2FS_I(inode)->extent_tree;
- f2fs_wait_on_page_writeback(node_page, NODE, true);
+ f2fs_wait_on_page_writeback(node_page, NODE, true, true);
set_page_dirty(node_page);
f2fs_inode_synced(inode);
@@ -542,11 +542,11 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
if (f2fs_has_extra_attr(inode)) {
ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
- if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb))
+ if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)))
ri->i_inline_xattr_size =
cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size);
- if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) &&
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
i_projid)) {
projid_t i_projid;
@@ -556,7 +556,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_projid = cpu_to_le32(i_projid);
}
- if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) &&
+ if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) &&
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
i_crtime)) {
ri->i_crtime =
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 99299ede7429..62d9829f3a6a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -61,7 +61,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
goto fail;
}
- if (f2fs_sb_has_project_quota(sbi->sb) &&
+ if (f2fs_sb_has_project_quota(sbi) &&
(F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
else
@@ -79,7 +79,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
- if (f2fs_sb_has_extra_attr(sbi->sb)) {
+ if (f2fs_sb_has_extra_attr(sbi)) {
set_inode_flag(inode, FI_EXTRA_ATTR);
F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
}
@@ -92,7 +92,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
- if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
if (f2fs_has_inline_xattr(inode))
xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
@@ -635,7 +635,7 @@ out_f2fs_handle_failed_inode:
f2fs_handle_failed_inode(inode);
out_free_encrypted_link:
if (disk_link.name != (unsigned char *)symname)
- kfree(disk_link.name);
+ kvfree(disk_link.name);
return err;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 2b34206486d8..4f450e573312 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
if (PageDirty(page)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
}
@@ -826,6 +826,7 @@ static int truncate_node(struct dnode_of_data *dn)
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info ni;
int err;
+ pgoff_t index;
err = f2fs_get_node_info(sbi, dn->nid, &ni);
if (err)
@@ -845,10 +846,11 @@ static int truncate_node(struct dnode_of_data *dn)
clear_node_page_dirty(dn->node_page);
set_sbi_flag(sbi, SBI_IS_DIRTY);
+ index = dn->node_page->index;
f2fs_put_page(dn->node_page, 1);
invalidate_mapping_pages(NODE_MAPPING(sbi),
- dn->node_page->index, dn->node_page->index);
+ index, index);
dn->node_page = NULL;
trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
@@ -1104,7 +1106,7 @@ skip_partial:
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
BUG_ON(page->mapping != NODE_MAPPING(sbi));
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
@@ -1232,7 +1234,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
new_ni.version = 0;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(page, S_ISDIR(dn->inode->i_mode));
if (!PageUptodate(page))
@@ -1306,9 +1308,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
if (f2fs_check_nid_range(sbi, nid))
return;
- rcu_read_lock();
- apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
- rcu_read_unlock();
+ apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
if (apage)
return;
@@ -1598,10 +1598,10 @@ int f2fs_move_node_page(struct page *node_page, int gc_type)
.for_reclaim = 0,
};
+ f2fs_wait_on_page_writeback(node_page, NODE, true, true);
+
set_page_dirty(node_page);
- f2fs_wait_on_page_writeback(node_page, NODE, true);
- f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
if (!clear_page_dirty_for_io(node_page)) {
err = -EAGAIN;
goto out_page;
@@ -1689,8 +1689,7 @@ continue_unlock:
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(page, NODE, true);
- BUG_ON(PageWriteback(page));
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
@@ -1741,7 +1740,7 @@ continue_unlock:
"Retry to write fsync mark: ino=%u, idx=%lx",
ino, last_page->index);
lock_page(last_page);
- f2fs_wait_on_page_writeback(last_page, NODE, true);
+ f2fs_wait_on_page_writeback(last_page, NODE, true, true);
set_page_dirty(last_page);
unlock_page(last_page);
goto retry;
@@ -1822,9 +1821,8 @@ continue_unlock:
goto lock_node;
}
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
- BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -1891,7 +1889,7 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
get_page(page);
spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, false);
if (TestClearPageError(page))
ret = -EIO;
@@ -2469,7 +2467,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
src_addr = inline_xattr_addr(inode, page);
inline_size = inline_xattr_size(inode);
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
memcpy(dst_addr, src_addr, inline_size);
update_inode:
f2fs_update_inode(inode, ipage);
@@ -2563,17 +2561,17 @@ retry:
if (dst->i_inline & F2FS_EXTRA_ATTR) {
dst->i_extra_isize = src->i_extra_isize;
- if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
+ if (f2fs_sb_has_flexible_inline_xattr(sbi) &&
F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
i_inline_xattr_size))
dst->i_inline_xattr_size = src->i_inline_xattr_size;
- if (f2fs_sb_has_project_quota(sbi->sb) &&
+ if (f2fs_sb_has_project_quota(sbi) &&
F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
i_projid))
dst->i_projid = src->i_projid;
- if (f2fs_sb_has_inode_crtime(sbi->sb) &&
+ if (f2fs_sb_has_inode_crtime(sbi) &&
F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
i_crtime_nsec)) {
dst->i_crtime = src->i_crtime;
@@ -3115,17 +3113,17 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
for (i = 0; i < nm_i->nat_blocks; i++)
kvfree(nm_i->free_nid_bitmap[i]);
- kfree(nm_i->free_nid_bitmap);
+ kvfree(nm_i->free_nid_bitmap);
}
kvfree(nm_i->free_nid_count);
- kfree(nm_i->nat_bitmap);
- kfree(nm_i->nat_bits);
+ kvfree(nm_i->nat_bitmap);
+ kvfree(nm_i->nat_bits);
#ifdef CONFIG_F2FS_CHECK_FS
- kfree(nm_i->nat_bitmap_mir);
+ kvfree(nm_i->nat_bitmap_mir);
#endif
sbi->nm_info = NULL;
- kfree(nm_i);
+ kvfree(nm_i);
}
int __init f2fs_create_node_manager_caches(void)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 1c73d879a9bc..e05af5df5648 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -361,7 +361,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
- f2fs_wait_on_page_writeback(p, NODE, true);
+ f2fs_wait_on_page_writeback(p, NODE, true, true);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 1dfb17f9f9ff..e3883db868d8 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -250,7 +250,7 @@ static int recover_inode(struct inode *inode, struct page *page)
i_gid_write(inode, le32_to_cpu(raw->i_gid));
if (raw->i_inline & F2FS_EXTRA_ATTR) {
- if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) &&
F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize),
i_projid)) {
projid_t i_projid;
@@ -539,7 +539,7 @@ retry_dn:
goto out;
}
- f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
err = f2fs_get_node_info(sbi, dn.nid, &ni);
if (err)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6edcf8391dd3..9b79056d705d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -229,7 +229,7 @@ static int __revoke_inmem_pages(struct inode *inode,
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
if (recover) {
struct dnode_of_data dn;
@@ -387,8 +387,9 @@ static int __f2fs_commit_inmem_pages(struct inode *inode)
if (page->mapping == inode->i_mapping) {
trace_f2fs_commit_inmem_page(page, INMEM);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
@@ -620,14 +621,16 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
return 0;
if (!test_opt(sbi, FLUSH_MERGE)) {
+ atomic_inc(&fcc->queued_flush);
ret = submit_flush_wait(sbi, ino);
+ atomic_dec(&fcc->queued_flush);
atomic_inc(&fcc->issued_flush);
return ret;
}
- if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) {
+ if (atomic_inc_return(&fcc->queued_flush) == 1 || sbi->s_ndevs > 1) {
ret = submit_flush_wait(sbi, ino);
- atomic_dec(&fcc->issing_flush);
+ atomic_dec(&fcc->queued_flush);
atomic_inc(&fcc->issued_flush);
return ret;
@@ -646,14 +649,14 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
if (fcc->f2fs_issue_flush) {
wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->issing_flush);
+ atomic_dec(&fcc->queued_flush);
} else {
struct llist_node *list;
list = llist_del_all(&fcc->issue_list);
if (!list) {
wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->issing_flush);
+ atomic_dec(&fcc->queued_flush);
} else {
struct flush_cmd *tmp, *next;
@@ -662,7 +665,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
llist_for_each_entry_safe(tmp, next, list, llnode) {
if (tmp == &cmd) {
cmd.ret = ret;
- atomic_dec(&fcc->issing_flush);
+ atomic_dec(&fcc->queued_flush);
continue;
}
tmp->ret = ret;
@@ -691,7 +694,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
if (!fcc)
return -ENOMEM;
atomic_set(&fcc->issued_flush, 0);
- atomic_set(&fcc->issing_flush, 0);
+ atomic_set(&fcc->queued_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->fcc_info = fcc;
@@ -703,7 +706,7 @@ init_thread:
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
err = PTR_ERR(fcc->f2fs_issue_flush);
- kfree(fcc);
+ kvfree(fcc);
SM_I(sbi)->fcc_info = NULL;
return err;
}
@@ -722,7 +725,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
kthread_stop(flush_thread);
}
if (free) {
- kfree(fcc);
+ kvfree(fcc);
SM_I(sbi)->fcc_info = NULL;
}
}
@@ -907,7 +910,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
dc->len = len;
dc->ref = 0;
dc->state = D_PREP;
- dc->issuing = 0;
+ dc->queued = 0;
dc->error = 0;
init_completion(&dc->wait);
list_add_tail(&dc->list, pend_list);
@@ -940,7 +943,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc,
struct discard_cmd *dc)
{
if (dc->state == D_DONE)
- atomic_sub(dc->issuing, &dcc->issing_discard);
+ atomic_sub(dc->queued, &dcc->queued_discard);
list_del(&dc->list);
rb_erase_cached(&dc->rb_node, &dcc->root);
@@ -1143,12 +1146,12 @@ submit:
dc->bio_ref++;
spin_unlock_irqrestore(&dc->lock, flags);
- atomic_inc(&dcc->issing_discard);
- dc->issuing++;
+ atomic_inc(&dcc->queued_discard);
+ dc->queued++;
list_move_tail(&dc->list, wait_list);
/* sanity check on discard range */
- __check_sit_bitmap(sbi, start, start + len);
+ __check_sit_bitmap(sbi, lstart, lstart + len);
bio->bi_private = dc;
bio->bi_end_io = f2fs_submit_discard_endio;
@@ -1649,6 +1652,10 @@ static int issue_discard_thread(void *data)
if (dcc->discard_wake)
dcc->discard_wake = 0;
+ /* clean up pending candidates before going to sleep */
+ if (atomic_read(&dcc->queued_discard))
+ __wait_all_discard_cmd(sbi, NULL);
+
if (try_to_freeze())
continue;
if (f2fs_readonly(sbi->sb))
@@ -1734,7 +1741,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
#ifdef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_has_blkzoned(sbi->sb) &&
+ if (f2fs_sb_has_blkzoned(sbi) &&
bdev_zoned_model(bdev) != BLK_ZONED_NONE)
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
@@ -1882,7 +1889,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason & CP_DISCARD);
- bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1;
+ bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
mutex_lock(&dirty_i->seglist_lock);
@@ -1914,7 +1921,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
(end - 1) <= cpc->trim_end)
continue;
- if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) {
+ if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
continue;
@@ -1946,7 +1953,7 @@ find_next:
sbi->blocks_per_seg, cur_pos);
len = next_pos - cur_pos;
- if (f2fs_sb_has_blkzoned(sbi->sb) ||
+ if (f2fs_sb_has_blkzoned(sbi) ||
(force && len < cpc->trim_minlen))
goto skip;
@@ -1994,7 +2001,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&dcc->fstrim_list);
mutex_init(&dcc->cmd_lock);
atomic_set(&dcc->issued_discard, 0);
- atomic_set(&dcc->issing_discard, 0);
+ atomic_set(&dcc->queued_discard, 0);
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
@@ -2010,7 +2017,7 @@ init_thread:
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(dcc->f2fs_issue_discard)) {
err = PTR_ERR(dcc->f2fs_issue_discard);
- kfree(dcc);
+ kvfree(dcc);
SM_I(sbi)->dcc_info = NULL;
return err;
}
@@ -2027,7 +2034,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
f2fs_stop_discard_thread(sbi);
- kfree(dcc);
+ kvfree(dcc);
SM_I(sbi)->dcc_info = NULL;
}
@@ -2146,7 +2153,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* update total number of valid blocks to be written in ckpt area */
SIT_I(sbi)->written_valid_blocks += del;
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
get_sec_entry(sbi, segno)->valid_blocks += del;
}
@@ -2412,7 +2419,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
{
/* if segs_per_sec is larger than 1, we need to keep original policy. */
- if (sbi->segs_per_sec != 1)
+ if (__is_large_section(sbi))
return CURSEG_I(sbi, type)->segno;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
@@ -2722,7 +2729,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
struct discard_policy dpolicy;
unsigned long long trimmed = 0;
int err = 0;
- bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1;
+ bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -3272,16 +3279,18 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
}
void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered)
+ enum page_type type, bool ordered, bool locked)
{
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
- if (ordered)
+ if (ordered) {
wait_on_page_writeback(page);
- else
+ f2fs_bug_on(sbi, locked && PageWriteback(page));
+ } else {
wait_for_stable_page(page);
+ }
}
}
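
The extra bool parameter lets callers record that they hold the page lock: after an ordered wait, a locked page cannot be picked up for writeback again, so the new f2fs_bug_on() asserts PageWriteback() is clear. An illustrative (non-compilable out of tree) usage sketch:

static void modify_locked_page(struct page *page)
{
	lock_page(page);
	/* the final 'true' tells the helper the page is locked, so it can
	 * assert !PageWriteback(page) once the ordered wait returns */
	f2fs_wait_on_page_writeback(page, DATA, true, true);
	set_page_dirty(page);
	unlock_page(page);
}
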
@@ -3298,7 +3307,7 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
if (cpage) {
- f2fs_wait_on_page_writeback(cpage, DATA, true);
+ f2fs_wait_on_page_writeback(cpage, DATA, true, true);
f2fs_put_page(cpage, 1);
}
}
@@ -3880,7 +3889,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
if (!sit_i->tmp_map)
return -ENOMEM;
- if (sbi->segs_per_sec > 1) {
+ if (__is_large_section(sbi)) {
sit_i->sec_entries =
f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry),
MAIN_SECS(sbi)),
@@ -4035,7 +4044,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
se->valid_blocks;
}
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
get_sec_entry(sbi, start)->valid_blocks +=
se->valid_blocks;
}
@@ -4079,7 +4088,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sbi->discard_blks -= se->valid_blocks;
}
- if (sbi->segs_per_sec > 1) {
+ if (__is_large_section(sbi)) {
get_sec_entry(sbi, start)->valid_blocks +=
se->valid_blocks;
get_sec_entry(sbi, start)->valid_blocks -=
@@ -4314,7 +4323,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
destroy_victim_secmap(sbi);
SM_I(sbi)->dirty_info = NULL;
- kfree(dirty_i);
+ kvfree(dirty_i);
}
static void destroy_curseg(struct f2fs_sb_info *sbi)
@@ -4326,10 +4335,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
return;
SM_I(sbi)->curseg_array = NULL;
for (i = 0; i < NR_CURSEG_TYPE; i++) {
- kfree(array[i].sum_blk);
- kfree(array[i].journal);
+ kvfree(array[i].sum_blk);
+ kvfree(array[i].journal);
}
- kfree(array);
+ kvfree(array);
}
static void destroy_free_segmap(struct f2fs_sb_info *sbi)
@@ -4340,7 +4349,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = NULL;
kvfree(free_i->free_segmap);
kvfree(free_i->free_secmap);
- kfree(free_i);
+ kvfree(free_i);
}
static void destroy_sit_info(struct f2fs_sb_info *sbi)
@@ -4353,26 +4362,26 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
if (sit_i->sentries) {
for (start = 0; start < MAIN_SEGS(sbi); start++) {
- kfree(sit_i->sentries[start].cur_valid_map);
+ kvfree(sit_i->sentries[start].cur_valid_map);
#ifdef CONFIG_F2FS_CHECK_FS
- kfree(sit_i->sentries[start].cur_valid_map_mir);
+ kvfree(sit_i->sentries[start].cur_valid_map_mir);
#endif
- kfree(sit_i->sentries[start].ckpt_valid_map);
- kfree(sit_i->sentries[start].discard_map);
+ kvfree(sit_i->sentries[start].ckpt_valid_map);
+ kvfree(sit_i->sentries[start].discard_map);
}
}
- kfree(sit_i->tmp_map);
+ kvfree(sit_i->tmp_map);
kvfree(sit_i->sentries);
kvfree(sit_i->sec_entries);
kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
- kfree(sit_i->sit_bitmap);
+ kvfree(sit_i->sit_bitmap);
#ifdef CONFIG_F2FS_CHECK_FS
- kfree(sit_i->sit_bitmap_mir);
+ kvfree(sit_i->sit_bitmap_mir);
#endif
- kfree(sit_i);
+ kvfree(sit_i);
}
void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
@@ -4388,7 +4397,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
destroy_free_segmap(sbi);
destroy_sit_info(sbi);
sbi->sm_info = NULL;
- kfree(sm_info);
+ kvfree(sm_info);
}
int __init f2fs_create_segment_manager_caches(void)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ab3465faddf1..a77f76f528b6 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -333,7 +333,7 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
* In order to get # of valid blocks in a section instantly from many
* segments, f2fs manages two counting structures separately.
*/
- if (use_section && sbi->segs_per_sec > 1)
+ if (use_section && __is_large_section(sbi))
return get_sec_entry(sbi, segno)->valid_blocks;
else
return get_seg_entry(sbi, segno)->valid_blocks;
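
__is_large_section() replaces the many open-coded segs_per_sec comparisons; its definition is not part of this diff, but from the call sites it is presumably a one-line predicate in f2fs.h along these lines:

/* presumed definition: a section is "large" when it spans >1 segment */
#define __is_large_section(sbi)		((sbi)->segs_per_sec > 1)
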
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 9e13db994fdf..a467aca29cfe 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -135,6 +135,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
spin_lock(&f2fs_list_lock);
- list_del(&sbi->s_list);
+ list_del_init(&sbi->s_list);
spin_unlock(&f2fs_list_lock);
}
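
list_del() leaves the entry's pointers unusable (the kernel poisons them), so a later list_empty() test or second removal on sbi->s_list would be unsafe; list_del_init() re-points the entry at itself instead. A self-contained userspace illustration of the difference:

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *e, struct list_head *h)
{
	e->next = h->next; e->prev = h;
	h->next->prev = e; h->next = e;
}

static void list_del_init(struct list_head *e)
{
	e->next->prev = e->prev;
	e->prev->next = e->next;
	INIT_LIST_HEAD(e);              /* entry is an empty list again */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
	struct list_head head, node;

	INIT_LIST_HEAD(&head);
	list_add(&node, &head);
	list_del_init(&node);
	assert(list_empty(&node));      /* safe to test, re-add or re-delete */
	assert(list_empty(&head));
	return 0;
}
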
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index af58b2cc21b8..3d3ce9eb6d13 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -38,7 +38,7 @@ static struct kmem_cache *f2fs_inode_cachep;
#ifdef CONFIG_F2FS_FAULT_INJECTION
-char *f2fs_fault_name[FAULT_MAX] = {
+const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_KMALLOC] = "kmalloc",
[FAULT_KVMALLOC] = "kvmalloc",
[FAULT_PAGE_ALLOC] = "page alloc",
@@ -259,7 +259,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
"quota options when quota turned on");
return -EINVAL;
}
- if (f2fs_sb_has_quota_ino(sb)) {
+ if (f2fs_sb_has_quota_ino(sbi)) {
f2fs_msg(sb, KERN_INFO,
"QUOTA feature is enabled, so ignore qf_name");
return 0;
@@ -289,7 +289,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
set_opt(sbi, QUOTA);
return 0;
errout:
- kfree(qname);
+ kvfree(qname);
return ret;
}
@@ -302,7 +302,7 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
" when quota turned on");
return -EINVAL;
}
- kfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
+ kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
return 0;
}
@@ -314,7 +314,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
* 'grpquota' mount options are allowed even without quota feature
* to support legacy quotas in quota files.
*/
- if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) {
+ if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) {
f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. "
"Cannot enable project quota enforcement.");
return -1;
@@ -348,7 +348,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
}
}
- if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) {
+ if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) {
f2fs_msg(sbi->sb, KERN_INFO,
"QUOTA feature is enabled, so ignore jquota_fmt");
F2FS_OPTION(sbi).s_jquota_fmt = 0;
@@ -399,10 +399,10 @@ static int parse_options(struct super_block *sb, char *options)
set_opt(sbi, BG_GC);
set_opt(sbi, FORCE_FG_GC);
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
case Opt_disable_roll_forward:
set_opt(sbi, DISABLE_ROLL_FORWARD);
@@ -417,7 +417,7 @@ static int parse_options(struct super_block *sb, char *options)
set_opt(sbi, DISCARD);
break;
case Opt_nodiscard:
- if (f2fs_sb_has_blkzoned(sb)) {
+ if (f2fs_sb_has_blkzoned(sbi)) {
f2fs_msg(sb, KERN_WARNING,
"discard is required for zoned block devices");
return -EINVAL;
@@ -566,11 +566,11 @@ static int parse_options(struct super_block *sb, char *options)
return -ENOMEM;
if (strlen(name) == 8 &&
!strncmp(name, "adaptive", 8)) {
- if (f2fs_sb_has_blkzoned(sb)) {
+ if (f2fs_sb_has_blkzoned(sbi)) {
f2fs_msg(sb, KERN_WARNING,
"adaptive mode is not allowed with "
"zoned block device feature");
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
@@ -578,10 +578,10 @@ static int parse_options(struct super_block *sb, char *options)
!strncmp(name, "lfs", 3)) {
set_opt_mode(sbi, F2FS_MOUNT_LFS);
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
@@ -714,10 +714,10 @@ static int parse_options(struct super_block *sb, char *options)
!strncmp(name, "fs-based", 8)) {
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
case Opt_alloc:
name = match_strdup(&args[0]);
@@ -731,10 +731,10 @@ static int parse_options(struct super_block *sb, char *options)
!strncmp(name, "reuse", 5)) {
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
case Opt_fsync:
name = match_strdup(&args[0]);
@@ -751,14 +751,14 @@ static int parse_options(struct super_block *sb, char *options)
F2FS_OPTION(sbi).fsync_mode =
FSYNC_MODE_NOBARRIER;
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
case Opt_test_dummy_encryption:
#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (!f2fs_sb_has_encrypt(sb)) {
+ if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
return -EINVAL;
}
@@ -783,10 +783,10 @@ static int parse_options(struct super_block *sb, char *options)
!strncmp(name, "disable", 7)) {
set_opt(sbi, DISABLE_CHECKPOINT);
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
default:
f2fs_msg(sb, KERN_ERR,
@@ -799,13 +799,13 @@ static int parse_options(struct super_block *sb, char *options)
if (f2fs_check_quota_options(sbi))
return -EINVAL;
#else
- if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) {
+ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_msg(sbi->sb, KERN_INFO,
"Filesystem with quota feature cannot be mounted RDWR "
"without CONFIG_QUOTA");
return -EINVAL;
}
- if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) {
+ if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_msg(sb, KERN_ERR,
"Filesystem with project quota feature cannot be "
"mounted RDWR without CONFIG_QUOTA");
@@ -821,8 +821,8 @@ static int parse_options(struct super_block *sb, char *options)
}
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
- if (!f2fs_sb_has_extra_attr(sb) ||
- !f2fs_sb_has_flexible_inline_xattr(sb)) {
+ if (!f2fs_sb_has_extra_attr(sbi) ||
+ !f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_msg(sb, KERN_ERR,
"extra_attr or flexible_inline_xattr "
"feature is off");
@@ -1017,10 +1017,10 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
for (i = 0; i < sbi->s_ndevs; i++) {
blkdev_put(FDEV(i).bdev, FMODE_EXCL);
#ifdef CONFIG_BLK_DEV_ZONED
- kfree(FDEV(i).blkz_type);
+ kvfree(FDEV(i).blkz_type);
#endif
}
- kfree(sbi->devs);
+ kvfree(sbi->devs);
}
static void f2fs_put_super(struct super_block *sb)
@@ -1058,9 +1058,6 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_write_checkpoint(sbi, &cpc);
}
- /* f2fs_write_checkpoint can update stat information */
- f2fs_destroy_stats(sbi);
-
/*
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
@@ -1080,29 +1077,35 @@ static void f2fs_put_super(struct super_block *sb)
iput(sbi->node_inode);
iput(sbi->meta_inode);
+ /*
+ * iput() can update stat information, if f2fs_write_checkpoint()
+ * above failed with error.
+ */
+ f2fs_destroy_stats(sbi);
+
/* destroy f2fs internal modules */
f2fs_destroy_node_manager(sbi);
f2fs_destroy_segment_manager(sbi);
- kfree(sbi->ckpt);
+ kvfree(sbi->ckpt);
f2fs_unregister_sysfs(sbi);
sb->s_fs_info = NULL;
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- kfree(sbi->raw_super);
+ kvfree(sbi->raw_super);
destroy_device_list(sbi);
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
- kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
- kfree(sbi->write_io[i]);
- kfree(sbi);
+ kvfree(sbi->write_io[i]);
+ kvfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
@@ -1431,7 +1434,7 @@ static void default_options(struct f2fs_sb_info *sbi)
sbi->sb->s_flags |= SB_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
set_opt(sbi, DISCARD);
- if (f2fs_sb_has_blkzoned(sbi->sb))
+ if (f2fs_sb_has_blkzoned(sbi))
set_opt_mode(sbi, F2FS_MOUNT_LFS);
else
set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
@@ -1457,19 +1460,16 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
sbi->sb->s_flags |= SB_ACTIVE;
- mutex_lock(&sbi->gc_mutex);
f2fs_update_time(sbi, DISABLE_TIME);
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
+ mutex_lock(&sbi->gc_mutex);
err = f2fs_gc(sbi, true, false, NULL_SEGNO);
if (err == -ENODATA)
break;
- if (err && err != -EAGAIN) {
- mutex_unlock(&sbi->gc_mutex);
+ if (err && err != -EAGAIN)
return err;
- }
}
- mutex_unlock(&sbi->gc_mutex);
err = sync_filesystem(sbi->sb);
if (err)
@@ -1531,7 +1531,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
GFP_KERNEL);
if (!org_mount_opt.s_qf_names[i]) {
for (j = 0; j < i; j++)
- kfree(org_mount_opt.s_qf_names[j]);
+ kvfree(org_mount_opt.s_qf_names[j]);
return -ENOMEM;
}
} else {
@@ -1575,7 +1575,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~SB_RDONLY;
if (sb_any_quota_suspended(sb)) {
dquot_resume(sb, -1);
- } else if (f2fs_sb_has_quota_ino(sb)) {
+ } else if (f2fs_sb_has_quota_ino(sbi)) {
err = f2fs_enable_quotas(sb);
if (err)
goto restore_opts;
@@ -1651,7 +1651,7 @@ skip:
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- kfree(org_mount_opt.s_qf_names[i]);
+ kvfree(org_mount_opt.s_qf_names[i]);
#endif
/* Update the POSIXACL Flag */
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -1672,7 +1672,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i];
}
#endif
@@ -1817,7 +1817,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
int enabled = 0;
int i, err;
- if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) {
+ if (f2fs_sb_has_quota_ino(sbi) && rdonly) {
err = f2fs_enable_quotas(sbi->sb);
if (err) {
f2fs_msg(sbi->sb, KERN_ERR,
@@ -1848,7 +1848,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
unsigned long qf_inum;
int err;
- BUG_ON(!f2fs_sb_has_quota_ino(sb));
+ BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb)));
qf_inum = f2fs_qf_ino(sb, type);
if (!qf_inum)
@@ -1993,7 +1993,7 @@ static int f2fs_quota_off(struct super_block *sb, int type)
goto out_put;
err = dquot_quota_off(sb, type);
- if (err || f2fs_sb_has_quota_ino(sb))
+ if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb)))
goto out_put;
inode_lock(inode);
@@ -2173,7 +2173,7 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
* if LOST_FOUND feature is enabled.
*
*/
- if (f2fs_sb_has_lost_found(sbi->sb) &&
+ if (f2fs_sb_has_lost_found(sbi) &&
inode->i_ino == F2FS_ROOT_INO(sbi))
return -EPERM;
@@ -2396,7 +2396,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
__u32 crc = 0;
/* Check checksum_offset and crc in superblock */
- if (le32_to_cpu(raw_super->feature) & F2FS_FEATURE_SB_CHKSUM) {
+ if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) {
crc_offset = le32_to_cpu(raw_super->checksum_offset);
if (crc_offset !=
offsetof(struct f2fs_super_block, crc)) {
@@ -2496,10 +2496,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return 1;
}
- if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) {
+ if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) {
f2fs_msg(sb, KERN_INFO,
- "Wrong segment_count / block_count (%u > %u)",
- segment_count, le32_to_cpu(raw_super->block_count));
+ "Wrong segment_count / block_count (%u > %llu)",
+ segment_count, le64_to_cpu(raw_super->block_count));
return 1;
}
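
block_count is a 64-bit on-disk field, so reading it through le32_to_cpu() silently dropped the high word and could let the segment_count sanity check pass on a corrupted superblock. A small demo of the truncation (assuming a little-endian host for brevity):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t on_disk = 0x100000000ULL;   /* a block_count of 2^32 */
	unsigned char raw[8];
	uint32_t as32;
	uint64_t as64;

	memcpy(raw, &on_disk, sizeof(raw));  /* little-endian host assumed */
	memcpy(&as32, raw, sizeof(as32));    /* what le32_to_cpu() returned */
	memcpy(&as64, raw, sizeof(as64));    /* what le64_to_cpu() returns */

	printf("32-bit read: %u, 64-bit read: %llu\n",
	       (unsigned)as32, (unsigned long long)as64);
	return 0;    /* prints 0 vs 4294967296: the high word was lost */
}
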
@@ -2674,7 +2674,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
static void init_sb_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = sbi->raw_super;
- int i, j;
+ int i;
sbi->log_sectors_per_block =
le32_to_cpu(raw_super->log_sectors_per_block);
@@ -2692,7 +2692,10 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+ sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
+ sbi->migration_granularity = sbi->segs_per_sec;
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -2710,9 +2713,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
- for (i = 0; i < NR_PAGE_TYPE - 1; i++)
- for (j = HOT; j < NR_TEMP_TYPE; j++)
- mutex_init(&sbi->wio_mutex[i][j]);
init_rwsem(&sbi->io_order_lock);
spin_lock_init(&sbi->cp_lock);
@@ -2749,7 +2749,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
unsigned int n = 0;
int err = -EIO;
- if (!f2fs_sb_has_blkzoned(sbi->sb))
+ if (!f2fs_sb_has_blkzoned(sbi))
return 0;
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
@@ -2800,7 +2800,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
}
}
- kfree(zones);
+ kvfree(zones);
return err;
}
@@ -2860,7 +2860,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
/* No valid superblock */
if (!*raw_super)
- kfree(super);
+ kvfree(super);
else
err = 0;
@@ -2880,7 +2880,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
}
/* we should update superblock crc here */
- if (!recover && f2fs_sb_has_sb_chksum(sbi->sb)) {
+ if (!recover && f2fs_sb_has_sb_chksum(sbi)) {
crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi),
offsetof(struct f2fs_super_block, crc));
F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc);
@@ -2968,7 +2968,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
#ifdef CONFIG_BLK_DEV_ZONED
if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
- !f2fs_sb_has_blkzoned(sbi->sb)) {
+ !f2fs_sb_has_blkzoned(sbi)) {
f2fs_msg(sbi->sb, KERN_ERR,
"Zoned block device feature not enabled\n");
return -EINVAL;
@@ -3064,7 +3064,7 @@ try_onemore:
sbi->raw_super = raw_super;
/* precompute checksum seed for metadata */
- if (f2fs_sb_has_inode_chksum(sb))
+ if (f2fs_sb_has_inode_chksum(sbi))
sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
sizeof(raw_super->uuid));
@@ -3074,7 +3074,7 @@ try_onemore:
* devices, but mandatory for host-managed zoned block devices.
*/
#ifndef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_has_blkzoned(sb)) {
+ if (f2fs_sb_has_blkzoned(sbi)) {
f2fs_msg(sb, KERN_ERR,
"Zoned block device support is not enabled\n");
err = -EOPNOTSUPP;
@@ -3101,13 +3101,13 @@ try_onemore:
#ifdef CONFIG_QUOTA
sb->dq_op = &f2fs_quota_operations;
- if (f2fs_sb_has_quota_ino(sb))
+ if (f2fs_sb_has_quota_ino(sbi))
sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &f2fs_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
- if (f2fs_sb_has_quota_ino(sbi->sb)) {
+ if (f2fs_sb_has_quota_ino(sbi)) {
for (i = 0; i < MAXQUOTAS; i++) {
if (f2fs_qf_ino(sbi->sb, i))
sbi->nquota_files++;
@@ -3259,30 +3259,30 @@ try_onemore:
f2fs_build_gc_manager(sbi);
+ err = f2fs_build_stats(sbi);
+ if (err)
+ goto free_nm;
+
/* get an inode for node space */
sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
if (IS_ERR(sbi->node_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
err = PTR_ERR(sbi->node_inode);
- goto free_nm;
+ goto free_stats;
}
- err = f2fs_build_stats(sbi);
- if (err)
- goto free_node_inode;
-
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
if (IS_ERR(root)) {
f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
err = PTR_ERR(root);
- goto free_stats;
+ goto free_node_inode;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks ||
!root->i_size || !root->i_nlink) {
iput(root);
err = -EINVAL;
- goto free_stats;
+ goto free_node_inode;
}
sb->s_root = d_make_root(root); /* allocate root dentry */
@@ -3297,7 +3297,7 @@ try_onemore:
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount */
- if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) {
+ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
err = f2fs_enable_quotas(sb);
if (err)
f2fs_msg(sb, KERN_ERR,
@@ -3369,7 +3369,7 @@ skip_recovery:
if (err)
goto free_meta;
}
- kfree(options);
+ kvfree(options);
/* recover broken superblock */
if (recovery) {
@@ -3392,7 +3392,7 @@ skip_recovery:
free_meta:
#ifdef CONFIG_QUOTA
f2fs_truncate_quota_inode_pages(sb);
- if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb))
+ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb))
f2fs_quota_off_umount(sbi->sb);
#endif
/*
@@ -3406,19 +3406,19 @@ free_meta:
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
-free_stats:
- f2fs_destroy_stats(sbi);
free_node_inode:
f2fs_release_ino_entry(sbi, true);
truncate_inode_pages_final(NODE_MAPPING(sbi));
iput(sbi->node_inode);
+free_stats:
+ f2fs_destroy_stats(sbi);
free_nm:
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
free_devices:
destroy_device_list(sbi);
- kfree(sbi->ckpt);
+ kvfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
@@ -3428,19 +3428,19 @@ free_percpu:
destroy_percpu_info(sbi);
free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
- kfree(sbi->write_io[i]);
+ kvfree(sbi->write_io[i]);
free_options:
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
- kfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- kfree(options);
+ kvfree(options);
free_sb_buf:
- kfree(raw_super);
+ kvfree(raw_super);
free_sbi:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- kfree(sbi);
+ kvfree(sbi);
/* give only one another chance */
if (retry) {
@@ -3545,9 +3545,7 @@ static int __init init_f2fs_fs(void)
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
- err = f2fs_create_root_stats();
- if (err)
- goto free_filesystem;
+ f2fs_create_root_stats();
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
@@ -3555,7 +3553,6 @@ static int __init init_f2fs_fs(void)
free_root_stats:
f2fs_destroy_root_stats();
-free_filesystem:
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index b777cbdd796b..0575edbe3ed6 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -90,34 +90,34 @@ static ssize_t features_show(struct f2fs_attr *a,
if (!sb->s_bdev->bd_part)
return snprintf(buf, PAGE_SIZE, "0\n");
- if (f2fs_sb_has_encrypt(sb))
+ if (f2fs_sb_has_encrypt(sbi))
len += snprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
- if (f2fs_sb_has_blkzoned(sb))
+ if (f2fs_sb_has_blkzoned(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "blkzoned");
- if (f2fs_sb_has_extra_attr(sb))
+ if (f2fs_sb_has_extra_attr(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "extra_attr");
- if (f2fs_sb_has_project_quota(sb))
+ if (f2fs_sb_has_project_quota(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "projquota");
- if (f2fs_sb_has_inode_chksum(sb))
+ if (f2fs_sb_has_inode_chksum(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_checksum");
- if (f2fs_sb_has_flexible_inline_xattr(sb))
+ if (f2fs_sb_has_flexible_inline_xattr(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "flexible_inline_xattr");
- if (f2fs_sb_has_quota_ino(sb))
+ if (f2fs_sb_has_quota_ino(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "quota_ino");
- if (f2fs_sb_has_inode_crtime(sb))
+ if (f2fs_sb_has_inode_crtime(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_crtime");
- if (f2fs_sb_has_lost_found(sb))
+ if (f2fs_sb_has_lost_found(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "lost_found");
- if (f2fs_sb_has_sb_chksum(sb))
+ if (f2fs_sb_has_sb_chksum(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "sb_checksum");
len += snprintf(buf + len, PAGE_SIZE - len, "\n");
@@ -246,6 +246,11 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "migration_granularity")) {
+ if (t == 0 || t > sbi->segs_per_sec)
+ return -EINVAL;
+ }
+
if (!strcmp(a->attr.name, "trim_sections"))
return -EINVAL;
@@ -406,6 +411,7 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
@@ -460,6 +466,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(min_hot_blocks),
ATTR_LIST(min_ssr_sections),
ATTR_LIST(max_victim_search),
+ ATTR_LIST(migration_granularity),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 7261245c208d..18d5ffbc5e8c 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -288,7 +288,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
unsigned int index, unsigned int len,
const char *name, struct f2fs_xattr_entry **xe,
- void **base_addr)
+ void **base_addr, int *base_size)
{
void *cur_addr, *txattr_addr, *last_addr = NULL;
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
@@ -299,8 +299,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
if (!size && !inline_size)
return -ENODATA;
- txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode),
- inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS);
+ *base_size = inline_size + size + XATTR_PADDING_SIZE;
+ txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
if (!txattr_addr)
return -ENOMEM;
@@ -312,8 +312,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
*xe = __find_inline_xattr(inode, txattr_addr, &last_addr,
index, len, name);
- if (*xe)
+ if (*xe) {
+ *base_size = inline_size;
goto check;
+ }
}
/* read from xattr node block */
@@ -415,7 +417,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
}
f2fs_wait_on_page_writeback(ipage ? ipage : in_page,
- NODE, true);
+ NODE, true, true);
/* no need to use xattr node block */
if (hsize <= inline_size) {
err = f2fs_truncate_xattr_node(inode);
@@ -439,7 +441,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
goto in_page_out;
}
f2fs_bug_on(sbi, new_nid);
- f2fs_wait_on_page_writeback(xpage, NODE, true);
+ f2fs_wait_on_page_writeback(xpage, NODE, true, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
@@ -474,6 +476,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
int error = 0;
unsigned int size, len;
void *base_addr = NULL;
+ int base_size;
if (name == NULL)
return -EINVAL;
@@ -484,7 +487,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
- &entry, &base_addr);
+ &entry, &base_addr, &base_size);
up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -498,6 +501,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (buffer) {
char *pval = entry->e_name + entry->e_name_len;
+
+ if (base_size - (pval - (char *)base_addr) < size) {
+ error = -ERANGE;
+ goto out;
+ }
memcpy(buffer, pval, size);
}
error = size;
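
The new base_size plumbing lets f2fs_getxattr() verify that the entry's claimed value length actually fits inside the buffer lookup_all_xattrs() allocated, failing with -ERANGE instead of reading past the end. The guard in isolation, as a hedged sketch:

#include <errno.h>
#include <string.h>

/*
 * refuse the copy when the entry's claimed value length would run
 * past the end of the buffer holding the xattrs
 */
static int copy_xattr_value(char *buffer, const char *base_addr,
			    int base_size, const char *pval, size_t size)
{
	if (base_size - (int)(pval - base_addr) < (int)size)
		return -ERANGE;
	memcpy(buffer, pval, size);
	return (int)size;
}
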
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 78d501c1fb65..738e427e2d21 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -363,7 +363,7 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
*phys = 0;
*mapped_blocks = 0;
- if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) {
+ if (!is_fat32(sbi) && (inode->i_ino == MSDOS_ROOT_INO)) {
if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
*phys = sector + sbi->dir_start;
*mapped_blocks = 1;
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7f5f3699fc6c..9d01db37183f 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -57,7 +57,7 @@ static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1)
return;
/* root dir of FAT12/FAT16 */
- if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO))
+ if (!is_fat32(sbi) && (dir->i_ino == MSDOS_ROOT_INO))
return;
bh = sb_find_get_block(sb, phys);
@@ -369,7 +369,9 @@ static int fat_parse_short(struct super_block *sb,
}
memcpy(work, de->name, sizeof(work));
- /* see namei.c, msdos_format_name */
+ /* For an explanation of the special treatment of 0x05 in
+ * filenames, see msdos_format_name in namei_msdos.c
+ */
if (work[0] == 0x05)
work[0] = 0xE5;
@@ -803,7 +805,7 @@ static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
return fat_generic_ioctl(filp, cmd, arg);
}
- if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
+ if (!access_ok(d1, sizeof(struct __fat_dirent[2])))
return -EFAULT;
/*
* Yes, we don't need this put_user() absolutely. However old
@@ -843,7 +845,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
}
- if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
+ if (!access_ok(d1, sizeof(struct compat_dirent[2])))
return -EFAULT;
/*
* Yes, we don't need this put_user() absolutely. However old
@@ -1071,7 +1073,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
}
}
- dir->i_mtime = dir->i_atime = current_time(dir);
+ fat_truncate_time(dir, NULL, S_ATIME|S_MTIME);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -1311,7 +1313,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
}
}
if (dir->i_ino == MSDOS_ROOT_INO) {
- if (sbi->fat_bits != 32)
+ if (!is_fat32(sbi))
goto error;
} else if (MSDOS_I(dir)->i_start == 0) {
fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 9d7d2d5da28b..922a0c6ba46c 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -142,6 +142,34 @@ static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+/*
+ * Functions that determine the variant of the FAT file system (i.e.,
+ * whether this is FAT12, FAT16 or FAT32).
+ */
+static inline bool is_fat12(const struct msdos_sb_info *sbi)
+{
+ return sbi->fat_bits == 12;
+}
+
+static inline bool is_fat16(const struct msdos_sb_info *sbi)
+{
+ return sbi->fat_bits == 16;
+}
+
+static inline bool is_fat32(const struct msdos_sb_info *sbi)
+{
+ return sbi->fat_bits == 32;
+}
+
+/* Maximum number of clusters */
+static inline u32 max_fat(struct super_block *sb)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+ return is_fat32(sbi) ? MAX_FAT32 :
+ is_fat16(sbi) ? MAX_FAT16 : MAX_FAT12;
+}
+
static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
{
return container_of(inode, struct msdos_inode_info, vfs_inode);
@@ -257,7 +285,7 @@ static inline int fat_get_start(const struct msdos_sb_info *sbi,
const struct msdos_dir_entry *de)
{
int cluster = le16_to_cpu(de->start);
- if (sbi->fat_bits == 32)
+ if (is_fat32(sbi))
cluster |= (le16_to_cpu(de->starthi) << 16);
return cluster;
}
@@ -416,6 +444,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs);
extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs);
+extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
+ int flags);
+extern int fat_update_time(struct inode *inode, struct timespec64 *now,
+ int flags);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
int fat_cache_init(void);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f58c0cacc531..495edeafd60a 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -290,19 +290,17 @@ void fat_ent_access_init(struct super_block *sb)
mutex_init(&sbi->fat_lock);
- switch (sbi->fat_bits) {
- case 32:
+ if (is_fat32(sbi)) {
sbi->fatent_shift = 2;
sbi->fatent_ops = &fat32_ops;
- break;
- case 16:
+ } else if (is_fat16(sbi)) {
sbi->fatent_shift = 1;
sbi->fatent_ops = &fat16_ops;
- break;
- case 12:
+ } else if (is_fat12(sbi)) {
sbi->fatent_shift = -1;
sbi->fatent_ops = &fat12_ops;
- break;
+ } else {
+ fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits);
}
}
@@ -310,7 +308,7 @@ static void mark_fsinfo_dirty(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- if (sb_rdonly(sb) || sbi->fat_bits != 32)
+ if (sb_rdonly(sb) || !is_fat32(sbi))
return;
__mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC);
@@ -327,7 +325,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
/* Is this fatent's blocks including this entry? */
if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
return 0;
- if (sbi->fat_bits == 12) {
+ if (is_fat12(sbi)) {
if ((offset + 1) < sb->s_blocksize) {
/* This entry is on bhs[0]. */
if (fatent->nr_bhs == 2) {
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4f3d72fb1e60..b3bed32946b1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.fallocate = fat_fallocate,
};
@@ -227,7 +228,7 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
if (err)
goto out;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
int err2;
@@ -330,7 +331,7 @@ static int fat_free(struct inode *inode, int skip)
MSDOS_I(inode)->i_logstart = 0;
}
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
if (wait) {
err = fat_sync_inode(inode);
if (err) {
@@ -542,6 +543,18 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
up_write(&MSDOS_I(inode)->truncate_lock);
}
+ /*
+ * setattr_copy can't truncate these appropriately, so we'll
+ * copy them ourselves
+ */
+ if (attr->ia_valid & ATTR_ATIME)
+ fat_truncate_time(inode, &attr->ia_atime, S_ATIME);
+ if (attr->ia_valid & ATTR_CTIME)
+ fat_truncate_time(inode, &attr->ia_ctime, S_CTIME);
+ if (attr->ia_valid & ATTR_MTIME)
+ fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
+ attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
+
setattr_copy(inode, attr);
mark_inode_dirty(inode);
out:
@@ -552,4 +565,5 @@ EXPORT_SYMBOL_GPL(fat_setattr);
const struct inode_operations fat_file_inode_operations = {
.setattr = fat_setattr,
.getattr = fat_getattr,
+ .update_time = fat_update_time,
};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d6b81e31f9f5..79bb0e73a65f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -244,7 +244,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
mark_inode_dirty(inode);
}
@@ -564,7 +564,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
de->cdate, de->ctime_cs);
fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
} else
- inode->i_ctime = inode->i_atime = inode->i_mtime;
+ fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME);
return 0;
}
@@ -686,7 +686,7 @@ static void fat_set_state(struct super_block *sb,
b = (struct fat_boot_sector *) bh->b_data;
- if (sbi->fat_bits == 32) {
+ if (is_fat32(sbi)) {
if (set)
b->fat32.state |= FAT_STATE_DIRTY;
else
@@ -1396,7 +1396,7 @@ static int fat_read_root(struct inode *inode)
inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
inode->i_op = sbi->dir_ops;
inode->i_fop = &fat_dir_operations;
- if (sbi->fat_bits == 32) {
+ if (is_fat32(sbi)) {
MSDOS_I(inode)->i_start = sbi->root_cluster;
error = fat_calc_dir_size(inode);
if (error < 0)
@@ -1423,7 +1423,7 @@ static unsigned long calc_fat_clusters(struct super_block *sb)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
/* Divide first to avoid overflow */
- if (sbi->fat_bits != 12) {
+ if (!is_fat12(sbi)) {
unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
return ent_per_sec * sbi->fat_length;
}
@@ -1626,6 +1626,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sb->s_magic = MSDOS_SUPER_MAGIC;
sb->s_op = &fat_sops;
sb->s_export_op = &fat_export_ops;
+ /*
+ * fat timestamps are complex and truncated by fat itself, so
+ * we set 1 here to be fast
+ */
+ sb->s_time_gran = 1;
mutex_init(&sbi->nfs_build_inode_lock);
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1738,7 +1743,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
}
/* interpret volume ID as a little endian 32 bit integer */
- if (sbi->fat_bits == 32)
+ if (is_fat32(sbi))
sbi->vol_id = bpb.fat32_vol_id;
else /* fat 16 or 12 */
sbi->vol_id = bpb.fat16_vol_id;
@@ -1764,11 +1769,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
- if (sbi->fat_bits != 32)
+ if (!is_fat32(sbi))
sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;
/* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
- if (sbi->fat_bits == 32)
+ if (is_fat32(sbi))
sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
else /* fat 16 or 12 */
sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
@@ -1776,7 +1781,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/* check that FAT table does not overflow */
fat_clusters = calc_fat_clusters(sb);
total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
- if (total_clusters > MAX_FAT(sb)) {
+ if (total_clusters > max_fat(sb)) {
if (!silent)
fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
total_clusters);
@@ -1798,11 +1803,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
fat_ent_access_init(sb);
/*
- * The low byte of FAT's first entry must have same value with
- * media-field. But in real world, too many devices is
- * writing wrong value. So, removed that validity check.
+ * The low byte of the first FAT entry must have the same value as
+ * the media field of the boot sector. But in real world, too many
+ * devices are writing wrong values. So, removed that validity check.
*
- * if (FAT_FIRST_ENT(sb, media) != first)
+ * The removed check compared the first FAT entry to a value dependent
+ * on the media field like this:
+ * == (0x0F00 | media), for FAT12
+ * == (0XFF00 | media), for FAT16
+ * == (0x0FFFFF00 | media), for FAT32
*/
error = -EINVAL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 573836dcaefc..4fc950bb6433 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -7,6 +7,7 @@
*/
#include "fat.h"
+#include <linux/iversion.h>
/*
* fat_fs_error reports a file system problem that might indicate fat data
@@ -63,7 +64,7 @@ int fat_clusters_flush(struct super_block *sb)
struct buffer_head *bh;
struct fat_boot_fsinfo *fsinfo;
- if (sbi->fat_bits != 32)
+ if (!is_fat32(sbi))
return 0;
bh = sb_bread(sb, sbi->fsinfo_sector);
@@ -185,6 +186,13 @@ static long days_in_year[] = {
0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
};
+static inline int fat_tz_offset(struct msdos_sb_info *sbi)
+{
+ return (sbi->options.tz_set ?
+ -sbi->options.time_offset :
+ sys_tz.tz_minuteswest) * SECS_PER_MIN;
+}
+
/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs)
@@ -210,10 +218,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
+ days_in_year[month] + day
+ DAYS_DELTA) * SECS_PER_DAY;
- if (!sbi->options.tz_set)
- second += sys_tz.tz_minuteswest * SECS_PER_MIN;
- else
- second -= sbi->options.time_offset * SECS_PER_MIN;
+ second += fat_tz_offset(sbi);
if (time_cs) {
ts->tv_sec = second + (time_cs / 100);
@@ -229,9 +234,7 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs)
{
struct tm tm;
- time64_to_tm(ts->tv_sec,
- (sbi->options.tz_set ? sbi->options.time_offset :
- -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
+ time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm);
/* FAT can only support year between 1980 to 2107 */
if (tm.tm_year < 1980 - 1900) {
@@ -263,6 +266,80 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
}
EXPORT_SYMBOL_GPL(fat_time_unix2fat);
+static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
+{
+ return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
+}
+/*
+ * truncate the various times with appropriate granularity:
+ * root inode:
+ * all times always 0
+ * all other inodes:
+ * mtime - 2 seconds
+ * ctime
+ * msdos - 2 seconds
+ * vfat - 10 milliseconds
+ * atime - 24 hours (00:00:00 in local timezone)
+ */
+int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct timespec64 ts;
+
+ if (inode->i_ino == MSDOS_ROOT_INO)
+ return 0;
+
+ if (now == NULL) {
+ now = &ts;
+ ts = current_time(inode);
+ }
+
+ if (flags & S_ATIME) {
+ /* to localtime */
+ time64_t seconds = now->tv_sec - fat_tz_offset(sbi);
+ s32 remainder;
+
+ div_s64_rem(seconds, SECS_PER_DAY, &remainder);
+ /* to day boundary, and back to unix time */
+ seconds = seconds + fat_tz_offset(sbi) - remainder;
+
+ inode->i_atime = (struct timespec64){ seconds, 0 };
+ }
+ if (flags & S_CTIME) {
+ if (sbi->options.isvfat)
+ inode->i_ctime = timespec64_trunc(*now, 10000000);
+ else
+ inode->i_ctime = fat_timespec64_trunc_2secs(*now);
+ }
+ if (flags & S_MTIME)
+ inode->i_mtime = fat_timespec64_trunc_2secs(*now);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fat_truncate_time);
+
+int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
+{
+ int iflags = I_DIRTY_TIME;
+ bool dirty = false;
+
+ if (inode->i_ino == MSDOS_ROOT_INO)
+ return 0;
+
+ fat_truncate_time(inode, now, flags);
+ if (flags & S_VERSION)
+ dirty = inode_maybe_inc_iversion(inode, false);
+ if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
+ !(inode->i_sb->s_flags & SB_LAZYTIME))
+ dirty = true;
+
+ if (dirty)
+ iflags |= I_DIRTY_SYNC;
+ __mark_inode_dirty(inode, iflags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fat_update_time);
+
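
fat_truncate_time() clamps each timestamp to what the on-disk format can store: 2-second mtime, 2-second (msdos) or 10 ms (vfat) ctime, and day-granular atime computed at local midnight. A userspace sketch of the two interesting roundings (positive timestamps only; the kernel uses div_s64_rem() to stay correct for negative values):

#include <stdio.h>
#include <time.h>

#define SECS_PER_DAY (24 * 60 * 60)

/* mtime (and msdos ctime): 2-second resolution */
static time_t trunc_2secs(time_t t)
{
	return t & ~(time_t)1;
}

/* atime: midnight in the local timezone, expressed back in UTC */
static time_t trunc_atime(time_t t, long tz_offset_secs)
{
	time_t local = t - tz_offset_secs;

	return t - (local % SECS_PER_DAY);
}

int main(void)
{
	time_t now = 1546300799;          /* 2018-12-31 23:59:59 UTC */

	printf("mtime -> %lld\n", (long long)trunc_2secs(now));
	printf("atime -> %lld\n", (long long)trunc_atime(now, 5 * 3600));
	return 0;
}
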
int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
{
int i, err = 0;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index efb8c40c9d27..f2cd365a4e86 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
if (err)
return err;
- dir->i_ctime = dir->i_mtime = *ts;
+ fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -294,7 +294,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
err = PTR_ERR(inode);
goto out;
}
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -327,7 +327,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME);
fat_detach(inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -380,7 +380,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out;
}
set_nlink(inode, 2);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -413,7 +413,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME);
fat_detach(inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -478,7 +478,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
mark_inode_dirty(old_inode);
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ fat_truncate_time(old_dir, NULL, S_CTIME|S_MTIME);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -538,7 +538,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
if (err)
goto error_dotdot;
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = ts;
+ fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -548,7 +548,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
drop_nlink(new_inode);
if (is_dir)
drop_nlink(new_inode);
- new_inode->i_ctime = ts;
+ fat_truncate_time(new_inode, &ts, S_CTIME);
}
out:
brelse(sinfo.bh);
@@ -637,6 +637,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
.rename = msdos_rename,
.setattr = fat_setattr,
.getattr = fat_getattr,
+ .update_time = fat_update_time,
};
static void setup(struct super_block *sb)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 82cd1e69cbdf..996c8c25e9c6 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
goto cleanup;
/* update timestamp */
- dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+ fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto out;
}
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -810,7 +810,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
fat_detach(inode);
vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
@@ -836,7 +836,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
fat_detach(inode);
vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
@@ -876,7 +876,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
inode_inc_iversion(inode);
set_nlink(inode, 2);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -969,7 +969,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto error_dotdot;
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = ts;
+ fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -979,7 +979,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
drop_nlink(new_inode);
if (is_dir)
drop_nlink(new_inode);
- new_inode->i_ctime = ts;
+ fat_truncate_time(new_inode, &ts, S_CTIME);
}
out:
brelse(sinfo.bh);
@@ -1032,6 +1032,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
.rename = vfat_rename,
.setattr = fat_setattr,
.getattr = fat_getattr,
+ .update_time = fat_update_time,
};
static void setup(struct super_block *sb)
diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9d103d..3da91a112bab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
- synchronize_sched();
+ synchronize_rcu();
spin_lock(&files->file_lock);
if (!new_fdt)
@@ -457,6 +457,7 @@ struct files_struct init_files = {
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
@@ -640,6 +641,35 @@ out_unlock:
}
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+/*
+ * variant of __close_fd that gets a ref on the file for later fput
+ */
+int __close_fd_get_file(unsigned int fd, struct file **res)
+{
+ struct files_struct *files = current->files;
+ struct file *file;
+ struct fdtable *fdt;
+
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ if (fd >= fdt->max_fds)
+ goto out_unlock;
+ file = fdt->fd[fd];
+ if (!file)
+ goto out_unlock;
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ __put_unused_fd(files, fd);
+ spin_unlock(&files->file_lock);
+ get_file(file);
+ *res = file;
+ return filp_close(file, files);
+
+out_unlock:
+ spin_unlock(&files->file_lock);
+ *res = NULL;
+ return -ENOENT;
+}
+
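
__close_fd_get_file() detaches the descriptor like close() does, but hands the caller its own reference so the struct file can still be used after the fd is gone. A hypothetical caller (do_async_cancel() is an illustration, not a kernel API):

static void cancel_and_close(unsigned int fd)
{
	struct file *file;

	__close_fd_get_file(fd, &file);
	if (file) {
		do_async_cancel(file);   /* hypothetical consumer of the ref */
		fput(file);              /* drop the reference from get_file() */
	}
}
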
void do_close_on_exec(struct files_struct *files)
{
unsigned i;
@@ -676,7 +706,7 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
-static struct file *__fget(unsigned int fd, fmode_t mask)
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
{
struct files_struct *files = current->files;
struct file *file;
@@ -691,7 +721,7 @@ loop:
*/
if (file->f_mode & mask)
file = NULL;
- else if (!get_file_rcu(file))
+ else if (!get_file_rcu_many(file, refs))
goto loop;
}
rcu_read_unlock();
@@ -699,15 +729,20 @@ loop:
return file;
}
+struct file *fget_many(unsigned int fd, unsigned int refs)
+{
+ return __fget(fd, FMODE_PATH, refs);
+}
+
struct file *fget(unsigned int fd)
{
- return __fget(fd, FMODE_PATH);
+ return __fget(fd, FMODE_PATH, 1);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- return __fget(fd, 0);
+ return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);
@@ -738,7 +773,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask);
+ file = __fget(fd, mask, 1);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
diff --git a/fs/file_table.c b/fs/file_table.c
index e49af4caf15d..155d7514a094 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,9 +326,9 @@ void flush_delayed_fput(void)
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
-void fput(struct file *file)
+void fput_many(struct file *file, unsigned int refs)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (atomic_long_sub_and_test(refs, &file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -347,6 +347,11 @@ void fput(struct file *file)
}
}
+void fput(struct file *file)
+{
+ fput_many(file, 1);
+}
+
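
fget_many()/fput_many() let a caller that knows it will submit N operations take and release N references on f_count with a single atomic read-modify-write each, instead of N of them. A toy userspace model of the counting:

#include <stdatomic.h>
#include <stdio.h>

static atomic_long f_count = 1;   /* the fd table's own reference */

static void get_many(long refs)
{
	atomic_fetch_add(&f_count, refs);
}

/* returns 1 when the final reference was dropped */
static int put_many(long refs)
{
	return atomic_fetch_sub(&f_count, refs) == refs;
}

int main(void)
{
	get_many(64);                         /* one RMW for 64 queued I/Os */
	printf("free=%d\n", put_many(64));    /* 0: the table's ref remains */
	printf("free=%d\n", put_many(1));     /* 1: last reference dropped */
	return 0;
}
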
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
@@ -380,10 +385,11 @@ void __init files_init(void)
void __init files_maxfiles_init(void)
{
unsigned long n;
- unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+ unsigned long nr_pages = totalram_pages();
+ unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
- memreserve = min(memreserve, totalram_pages - 1);
- n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
+ memreserve = min(memreserve, nr_pages - 1);
+ n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..36855c1f8daf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -331,17 +331,34 @@ struct inode_switch_wbs_context {
struct work_struct work;
};
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+ down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+ up_write(&bdi->wb_switch_rwsem);
+}
+
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
- struct radix_tree_iter iter;
+ XA_STATE(xas, &mapping->i_pages, 0);
+ struct page *page;
bool switched = false;
- void **slot;
+
+ /*
+ * If @inode switches cgwb membership while sync_inodes_sb() is
+ * being issued, sync_inodes_sb() might miss it. Synchronize.
+ */
+ down_read(&bdi->wb_switch_rwsem);
/*
* By the time control reaches here, RCU grace period has passed
@@ -375,25 +392,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
* pages actually under writeback.
*/
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
- PAGECACHE_TAG_DIRTY) {
- struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (likely(page) && PageDirty(page)) {
+ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
+ if (PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
- PAGECACHE_TAG_WRITEBACK) {
- struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (likely(page)) {
- WARN_ON_ONCE(!PageWriteback(page));
- dec_wb_stat(old_wb, WB_WRITEBACK);
- inc_wb_stat(new_wb, WB_WRITEBACK);
- }
+ xas_set(&xas, 0);
+ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ dec_wb_stat(old_wb, WB_WRITEBACK);
+ inc_wb_stat(new_wb, WB_WRITEBACK);
}
wb_get(new_wb);
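The hunk above replaces the two radix_tree_for_each_tagged() walks with the XArray cursor API. A condensed sketch of the new idiom, assuming the caller already holds the i_pages lock (or RCU read lock) as the original code does; count_tagged() is a hypothetical name:

static void count_tagged(struct address_space *mapping,
			 unsigned long *dirty, unsigned long *writeback)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;

	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY)
		(*dirty)++;

	xas_set(&xas, 0);	/* rewind the cursor for the second mark */
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK)
		(*writeback)++;
}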
@@ -435,6 +445,8 @@ skip_switch:
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
+ up_read(&bdi->wb_switch_rwsem);
+
if (switched) {
wb_wakeup(new_wb);
wb_put(old_wb);
@@ -475,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (inode->i_state & I_WB_SWITCH)
return;
+ /*
+ * Avoid starting new switches while sync_inodes_sb() is in
+ * progress. Otherwise, if the down_write protected issue path
+ * blocks heavily, we might end up starting a large number of
+ * switches which will block on the rwsem.
+ */
+ if (!down_read_trylock(&bdi->wb_switch_rwsem))
+ return;
+
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
if (!isw)
- return;
+ goto out_unlock;
/* find and pin the new wb */
rcu_read_lock();
@@ -511,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
- return;
+ goto out_unlock;
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
+out_unlock:
+ up_read(&bdi->wb_switch_rwsem);
}
/**
@@ -894,6 +917,9 @@ fs_initcall(cgroup_writeback_init);
#else /* CONFIG_CGROUP_WRITEBACK */
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
@@ -2420,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+ bdi_down_write_wb_switch_rwsem(bdi);
bdi_split_work_to_wbs(bdi, &work, false);
wb_wait_for_completion(bdi, &done);
+ bdi_up_write_wb_switch_rwsem(bdi);
wait_sb_inodes(sb);
}
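The fs-writeback hunks implement one locking scheme: sync_inodes_sb() is the writer on bdi->wb_switch_rwsem, inode switches are readers, and the switch path uses a trylock so it backs off rather than queueing behind a slow sync. A condensed sketch, with hypothetical function names standing in for the real call sites:

static void sync_side(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);	/* exclude all wb switches */
	/* ... issue writeback and wait for completion ... */
	up_write(&bdi->wb_switch_rwsem);
}

static void switch_side(struct backing_dev_info *bdi)
{
	/* back off instead of blocking, so a slow sync cannot pile up
	 * an unbounded number of switch work items behind the rwsem */
	if (!down_read_trylock(&bdi->wb_switch_rwsem))
		return;
	/* ... perform the cgwb switch ... */
	up_read(&bdi->wb_switch_rwsem);
}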
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644
index 000000000000..78365e5dc08c
--- /dev/null
+++ b/fs/fs_types.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+ [FT_UNKNOWN] = DT_UNKNOWN,
+ [FT_REG_FILE] = DT_REG,
+ [FT_DIR] = DT_DIR,
+ [FT_CHRDEV] = DT_CHR,
+ [FT_BLKDEV] = DT_BLK,
+ [FT_FIFO] = DT_FIFO,
+ [FT_SOCK] = DT_SOCK,
+ [FT_SYMLINK] = DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+ if (filetype >= FT_MAX)
+ return DT_UNKNOWN;
+
+ return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+ [DT_REG] = FT_REG_FILE,
+ [DT_DIR] = FT_DIR,
+ [DT_LNK] = FT_SYMLINK,
+ [DT_CHR] = FT_CHRDEV,
+ [DT_BLK] = FT_BLKDEV,
+ [DT_FIFO] = FT_FIFO,
+ [DT_SOCK] = FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN - Unknown type
+ * * FT_REG_FILE - Regular file
+ * * FT_DIR - Directory
+ * * FT_CHRDEV - Character device
+ * * FT_BLKDEV - Block device
+ * * FT_FIFO - FIFO
+ * * FT_SOCK - Local-domain socket
+ * * FT_SYMLINK - Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+ return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+ return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
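A typical consumer of these helpers is a filesystem's directory code: converting the on-disk FT_* value when emitting dirents, and the inode mode when writing an entry. A hedged sketch; the de_* helpers are hypothetical, only the fs_*() conversions come from this file:

static bool de_emit(struct dir_context *ctx, const struct qstr *name,
		    u64 ino, unsigned int ondisk_ftype)
{
	/* fs_ftype_to_dtype() bounds-checks against FT_MAX for us */
	return dir_emit(ctx, name->name, name->len, ino,
			fs_ftype_to_dtype(ondisk_ftype));
}

static void de_fill_type(unsigned char *de_type, struct inode *inode)
{
	*de_type = fs_umode_to_ftype(inode->i_mode);
}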
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 9edc920f651f..6d9cb1719de5 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -730,6 +730,9 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
if (awaken)
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
/* Prevent a race with our last child, which has to signal EV_CLEARED
* before dropping our spinlock.
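The two added lines follow the standard bit-wait handshake: a sleeper parks on a flag bit with wait_on_bit() and the waker clears the bit before calling wake_up_bit(). A generic sketch of the pattern, with hypothetical waiter()/waker() wrappers:

static void waiter(unsigned long *flags)
{
	/* sleeps until FSCACHE_COOKIE_LOOKING_UP is cleared */
	wait_on_bit(flags, FSCACHE_COOKIE_LOOKING_UP, TASK_UNINTERRUPTIBLE);
}

static void waker(unsigned long *flags)
{
	/* clear-then-wake; waking without clearing would let the waiter
	 * re-check the still-set bit and go straight back to sleep */
	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, flags))
		wake_up_bit(flags, FSCACHE_COOKIE_LOOKING_UP);
}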
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 60da84a86dab..f7b807bc1027 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o
+fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 0b694655d988..989df5accaee 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file,
if (!fc)
return 0;
- val = fc->max_background;
+ val = READ_ONCE(fc->max_background);
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -125,7 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
if (ret > 0) {
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
+ spin_lock(&fc->bg_lock);
fc->max_background = val;
+ fc->blocked = fc->num_background >= fc->max_background;
+ if (!fc->blocked)
+ wake_up(&fc->blocked_waitq);
+ spin_unlock(&fc->bg_lock);
fuse_conn_put(fc);
}
}
@@ -144,7 +149,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
if (!fc)
return 0;
- val = fc->congestion_threshold;
+ val = READ_ONCE(fc->congestion_threshold);
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -155,18 +160,31 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
size_t count, loff_t *ppos)
{
unsigned uninitialized_var(val);
+ struct fuse_conn *fc;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
max_user_congthresh);
- if (ret > 0) {
- struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
- if (fc) {
- fc->congestion_threshold = val;
- fuse_conn_put(fc);
+ if (ret <= 0)
+ goto out;
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ goto out;
+
+ spin_lock(&fc->bg_lock);
+ fc->congestion_threshold = val;
+ if (fc->sb) {
+ if (fc->num_background < fc->congestion_threshold) {
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ } else {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
}
-
+ spin_unlock(&fc->bg_lock);
+ fuse_conn_put(fc);
+out:
return ret;
}
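Both control-file writers now re-derive the dependent state under fc->bg_lock whenever a limit changes. The invariant, condensed into a hypothetical helper (the real code inlines this at each write site):

static void fuse_recheck_bg_limits(struct fuse_conn *fc)
{
	spin_lock(&fc->bg_lock);
	fc->blocked = fc->num_background >= fc->max_background;
	if (!fc->blocked)
		wake_up(&fc->blocked_waitq);
	if (fc->sb) {
		if (fc->num_background < fc->congestion_threshold) {
			clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
			clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
		} else {
			set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
			set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
		}
	}
	spin_unlock(&fc->bg_lock);
}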
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 11ea2c4a38ab..809c0f2f9942 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,6 +25,10 @@
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
+/* Ordinary requests have even IDs, while interrupt IDs are odd */

+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
static struct kmem_cache *fuse_req_cachep;
static struct fuse_dev *fuse_get_dev(struct file *file)
@@ -40,9 +44,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
struct fuse_page_desc *page_descs,
unsigned npages)
{
- memset(req, 0, sizeof(*req));
- memset(pages, 0, sizeof(*pages) * npages);
- memset(page_descs, 0, sizeof(*page_descs) * npages);
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
@@ -53,30 +54,36 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
__set_bit(FR_PENDING, &req->flags);
}
+static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags,
+ struct fuse_page_desc **desc)
+{
+ struct page **pages;
+
+ pages = kzalloc(npages * (sizeof(struct page *) +
+ sizeof(struct fuse_page_desc)), flags);
+ *desc = (void *) pages + npages * sizeof(struct page *);
+
+ return pages;
+}
+
static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
{
- struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+ struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req) {
- struct page **pages;
- struct fuse_page_desc *page_descs;
-
- if (npages <= FUSE_REQ_INLINE_PAGES) {
+ struct page **pages = NULL;
+ struct fuse_page_desc *page_descs = NULL;
+
+ WARN_ON(npages > FUSE_MAX_MAX_PAGES);
+ if (npages > FUSE_REQ_INLINE_PAGES) {
+ pages = fuse_req_pages_alloc(npages, flags,
+ &page_descs);
+ if (!pages) {
+ kmem_cache_free(fuse_req_cachep, req);
+ return NULL;
+ }
+ } else if (npages) {
pages = req->inline_pages;
page_descs = req->inline_page_descs;
- } else {
- pages = kmalloc_array(npages, sizeof(struct page *),
- flags);
- page_descs =
- kmalloc_array(npages,
- sizeof(struct fuse_page_desc),
- flags);
- }
-
- if (!pages || !page_descs) {
- kfree(pages);
- kfree(page_descs);
- kmem_cache_free(fuse_req_cachep, req);
- return NULL;
}
fuse_request_init(req, pages, page_descs, npages);
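fuse_req_pages_alloc() replaces the two kmalloc_array() calls with a single allocation carrying both arrays; the descriptor pointer is simply aimed past the page-pointer array. Layout sketch for npages == 4:

/*
 * one kzalloc of npages * (sizeof(struct page *) + sizeof(struct fuse_page_desc)):
 *
 *   pages -> [page*][page*][page*][page*][desc][desc][desc][desc]
 *                                         ^
 *            *desc = (void *)pages + npages * sizeof(struct page *)
 */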
@@ -95,12 +102,41 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
return __fuse_request_alloc(npages, GFP_NOFS);
}
-void fuse_request_free(struct fuse_req *req)
+static void fuse_req_pages_free(struct fuse_req *req)
{
- if (req->pages != req->inline_pages) {
+ if (req->pages != req->inline_pages)
kfree(req->pages);
- kfree(req->page_descs);
- }
+}
+
+bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
+ gfp_t flags)
+{
+ struct page **pages;
+ struct fuse_page_desc *page_descs;
+ unsigned int npages = min_t(unsigned int,
+ max_t(unsigned int, req->max_pages * 2,
+ FUSE_DEFAULT_MAX_PAGES_PER_REQ),
+ fc->max_pages);
+ WARN_ON(npages <= req->max_pages);
+
+ pages = fuse_req_pages_alloc(npages, flags, &page_descs);
+ if (!pages)
+ return false;
+
+ memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages);
+ memcpy(page_descs, req->page_descs,
+ sizeof(struct fuse_page_desc) * req->max_pages);
+ fuse_req_pages_free(req);
+ req->pages = pages;
+ req->page_descs = page_descs;
+ req->max_pages = npages;
+
+ return true;
+}
+
+void fuse_request_free(struct fuse_req *req)
+{
+ fuse_req_pages_free(req);
kmem_cache_free(fuse_req_cachep, req);
}
@@ -129,9 +165,13 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
static void fuse_drop_waiting(struct fuse_conn *fc)
{
- if (fc->connected) {
- atomic_dec(&fc->num_waiting);
- } else if (atomic_dec_and_test(&fc->num_waiting)) {
+ /*
+ * lockless check of fc->connected is okay, because atomic_dec_and_test()
+ * provides a memory barrier matched with the one in fuse_wait_aborted()
+ * to ensure no wake-up is missed.
+ */
+ if (atomic_dec_and_test(&fc->num_waiting) &&
+ !READ_ONCE(fc->connected)) {
/* wake up aborters */
wake_up_all(&fc->blocked_waitq);
}
@@ -235,8 +275,10 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
struct file *file = req->stolen_file;
struct fuse_file *ff = file->private_data;
+ WARN_ON(req->max_pages);
spin_lock(&fc->lock);
- fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
+ memset(req, 0, sizeof(*req));
+ fuse_request_init(req, NULL, NULL, 0);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
@@ -287,10 +329,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
* We get here in the unlikely case that a background
* request was allocated but not sent
*/
- spin_lock(&fc->lock);
+ spin_lock(&fc->bg_lock);
if (!fc->blocked)
wake_up(&fc->blocked_waitq);
- spin_unlock(&fc->lock);
+ spin_unlock(&fc->bg_lock);
}
if (test_bit(FR_WAITING, &req->flags)) {
@@ -319,7 +361,13 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
static u64 fuse_get_unique(struct fuse_iqueue *fiq)
{
- return ++fiq->reqctr;
+ fiq->reqctr += FUSE_REQ_ID_STEP;
+ return fiq->reqctr;
+}
+
+static unsigned int fuse_req_hash(u64 unique)
+{
+ return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
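With FUSE_REQ_ID_STEP == 2, fuse_get_unique() hands out 2, 4, 6, ..., so bit 0 of a request ID is always clear and is free to mean "this reply answers an interrupt". Two hypothetical helpers show how a reply's unique decomposes; the constants and fuse_req_hash() come from this patch:

static bool reply_is_interrupt(u64 unique)
{
	return unique & FUSE_INT_REQ_BIT;
}

static struct fuse_req *reply_to_req(struct fuse_pqueue *fpq, u64 unique)
{
	/* both the ordinary reply and its interrupt reply hash to the
	 * bucket of the underlying request ID */
	return request_find(fpq, unique & ~FUSE_INT_REQ_BIT);
}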
@@ -353,12 +401,13 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
static void flush_bg_queue(struct fuse_conn *fc)
{
+ struct fuse_iqueue *fiq = &fc->iq;
+
while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
- struct fuse_iqueue *fiq = &fc->iq;
- req = list_entry(fc->bg_queue.next, struct fuse_req, list);
+ req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
list_del(&req->list);
fc->active_background++;
spin_lock(&fiq->waitq.lock);
@@ -389,14 +438,21 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
WARN_ON(test_bit(FR_PENDING, &req->flags));
WARN_ON(test_bit(FR_SENT, &req->flags));
if (test_bit(FR_BACKGROUND, &req->flags)) {
- spin_lock(&fc->lock);
+ spin_lock(&fc->bg_lock);
clear_bit(FR_BACKGROUND, &req->flags);
- if (fc->num_background == fc->max_background)
+ if (fc->num_background == fc->max_background) {
fc->blocked = 0;
-
- /* Wake up next waiter, if any */
- if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
wake_up(&fc->blocked_waitq);
+ } else if (!fc->blocked) {
+ /*
+ * Wake up next waiter, if any. It's okay to use
+ * waitqueue_active(), as we've already synchronized
+ * fc->blocked with the waiters via the wake_up() call
+ * above.
+ */
+ if (waitqueue_active(&fc->blocked_waitq))
+ wake_up(&fc->blocked_waitq);
+ }
if (fc->num_background == fc->congestion_threshold && fc->sb) {
clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
@@ -405,7 +461,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
- spin_unlock(&fc->lock);
+ spin_unlock(&fc->bg_lock);
}
wake_up(&req->waitq);
if (req->end)
@@ -573,40 +629,38 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
return ret;
}
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
- struct fuse_req *req)
+bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req)
{
- BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+ bool queued = false;
+
+ WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
if (!test_bit(FR_WAITING, &req->flags)) {
__set_bit(FR_WAITING, &req->flags);
atomic_inc(&fc->num_waiting);
}
__set_bit(FR_ISREPLY, &req->flags);
- fc->num_background++;
- if (fc->num_background == fc->max_background)
- fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ spin_lock(&fc->bg_lock);
+ if (likely(fc->connected)) {
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ }
+ list_add_tail(&req->list, &fc->bg_queue);
+ flush_bg_queue(fc);
+ queued = true;
}
- list_add_tail(&req->list, &fc->bg_queue);
- flush_bg_queue(fc);
+ spin_unlock(&fc->bg_lock);
+
+ return queued;
}
void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
- BUG_ON(!req->end);
- spin_lock(&fc->lock);
- if (fc->connected) {
- fuse_request_send_background_locked(fc, req);
- spin_unlock(&fc->lock);
- } else {
- spin_unlock(&fc->lock);
+ WARN_ON(!req->end);
+ if (!fuse_request_queue_background(fc, req)) {
req->out.h.error = -ENOTCONN;
req->end(fc, req);
fuse_put_request(fc, req);
@@ -1084,12 +1138,11 @@ __releases(fiq->waitq.lock)
int err;
list_del_init(&req->intr_entry);
- req->intr_unique = fuse_get_unique(fiq);
memset(&ih, 0, sizeof(ih));
memset(&arg, 0, sizeof(arg));
ih.len = reqsize;
ih.opcode = FUSE_INTERRUPT;
- ih.unique = req->intr_unique;
+ ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);
arg.unique = req->in.h.unique;
spin_unlock(&fiq->waitq.lock);
@@ -1238,6 +1291,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_req *req;
struct fuse_in *in;
unsigned reqsize;
+ unsigned int hash;
restart:
spin_lock(&fiq->waitq.lock);
@@ -1310,13 +1364,16 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
err = reqsize;
goto out_end;
}
- list_move_tail(&req->list, &fpq->processing);
- spin_unlock(&fpq->lock);
+ hash = fuse_req_hash(req->in.h.unique);
+ list_move_tail(&req->list, &fpq->processing[hash]);
+ __fuse_get_request(req);
set_bit(FR_SENT, &req->flags);
+ spin_unlock(&fpq->lock);
/* matches barrier in request_wait_answer() */
smp_mb__after_atomic();
if (test_bit(FR_INTERRUPTED, &req->flags))
queue_interrupt(fiq, req);
+ fuse_put_request(fc, req);
return reqsize;
@@ -1663,7 +1720,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
- int num_pages;
+ unsigned int num_pages;
offset = outarg->offset & ~PAGE_MASK;
file_size = i_size_read(inode);
@@ -1675,7 +1732,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
num = file_size - outarg->offset;
num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+ num_pages = min(num_pages, fc->max_pages);
req = fuse_get_req(fc, num_pages);
if (IS_ERR(req))
@@ -1685,7 +1742,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
req->in.h.nodeid = outarg->nodeid;
req->in.numargs = 2;
req->in.argpages = 1;
- req->page_descs[0].offset = offset;
req->end = fuse_retrieve_end;
index = outarg->offset >> PAGE_SHIFT;
@@ -1700,6 +1756,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
this_num = min_t(unsigned, num, PAGE_SIZE - offset);
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].offset = offset;
req->page_descs[req->num_pages].length = this_num;
req->num_pages++;
@@ -1715,8 +1772,10 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
req->in.args[1].size = total_len;
err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
- if (err)
+ if (err) {
fuse_retrieve_end(fc, req);
+ fuse_put_request(fc, req);
+ }
return err;
}
@@ -1792,10 +1851,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
{
+ unsigned int hash = fuse_req_hash(unique);
struct fuse_req *req;
- list_for_each_entry(req, &fpq->processing, list) {
- if (req->in.h.unique == unique || req->intr_unique == unique)
+ list_for_each_entry(req, &fpq->processing[hash], list) {
+ if (req->in.h.unique == unique)
return req;
}
return NULL;
@@ -1869,22 +1929,26 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
if (!fpq->connected)
goto err_unlock_pq;
- req = request_find(fpq, oh.unique);
+ req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
if (!req)
goto err_unlock_pq;
- /* Is it an interrupt reply? */
- if (req->intr_unique == oh.unique) {
+ /* Is it an interrupt reply ID? */
+ if (oh.unique & FUSE_INT_REQ_BIT) {
+ __fuse_get_request(req);
spin_unlock(&fpq->lock);
err = -EINVAL;
- if (nbytes != sizeof(struct fuse_out_header))
+ if (nbytes != sizeof(struct fuse_out_header)) {
+ fuse_put_request(fc, req);
goto err_finish;
+ }
if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
queue_interrupt(&fc->iq, req);
+ fuse_put_request(fc, req);
fuse_copy_finish(cs);
return nbytes;
@@ -2013,8 +2077,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len);
+ pipe_lock(pipe);
for (idx = 0; idx < nbuf; idx++)
pipe_buf_release(pipe, &bufs[idx]);
+ pipe_unlock(pipe);
out:
kvfree(bufs);
@@ -2102,9 +2168,13 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
struct fuse_dev *fud;
struct fuse_req *req, *next;
LIST_HEAD(to_end);
+ unsigned int i;
+ /* Background queuing checks fc->connected under bg_lock */
+ spin_lock(&fc->bg_lock);
fc->connected = 0;
- fc->blocked = 0;
+ spin_unlock(&fc->bg_lock);
+
fc->aborted = is_abort;
fuse_set_initialized(fc);
list_for_each_entry(fud, &fc->devices, entry) {
@@ -2123,11 +2193,16 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
}
spin_unlock(&req->waitq.lock);
}
- list_splice_tail_init(&fpq->processing, &to_end);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_tail_init(&fpq->processing[i],
+ &to_end);
spin_unlock(&fpq->lock);
}
+ spin_lock(&fc->bg_lock);
+ fc->blocked = 0;
fc->max_background = UINT_MAX;
flush_bg_queue(fc);
+ spin_unlock(&fc->bg_lock);
spin_lock(&fiq->waitq.lock);
fiq->connected = 0;
@@ -2152,6 +2227,8 @@ EXPORT_SYMBOL_GPL(fuse_abort_conn);
void fuse_wait_aborted(struct fuse_conn *fc)
{
+ /* matches implicit memory barrier in fuse_drop_waiting() */
+ smp_mb();
wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0);
}
@@ -2163,10 +2240,12 @@ int fuse_dev_release(struct inode *inode, struct file *file)
struct fuse_conn *fc = fud->fc;
struct fuse_pqueue *fpq = &fud->pq;
LIST_HEAD(to_end);
+ unsigned int i;
spin_lock(&fpq->lock);
WARN_ON(!list_empty(&fpq->io));
- list_splice_init(&fpq->processing, &to_end);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
end_requests(fc, &to_end);
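The single processing list is now FUSE_PQ_HASH_SIZE buckets, so abort and release must splice every bucket, as above. The setup side is not visible in this hunk; a sketch of what initializing such a table looks like (helper name is hypothetical):

static struct list_head *fuse_pq_hash_alloc(void)
{
	struct list_head *hash;
	unsigned int i;

	hash = kmalloc_array(FUSE_PQ_HASH_SIZE, sizeof(*hash), GFP_KERNEL);
	if (hash)
		for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
			INIT_LIST_HEAD(&hash[i]);
	return hash;
}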
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0979609d6eba..e909678afa2d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,24 +14,9 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/iversion.h>
#include <linux/posix_acl.h>
-static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
-{
- struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_inode *fi = get_fuse_inode(dir);
-
- if (!fc->do_readdirplus)
- return false;
- if (!fc->readdirplus_auto)
- return true;
- if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
- return true;
- if (ctx->pos == 0)
- return true;
- return false;
-}
-
static void fuse_advise_use_readdirplus(struct inode *dir)
{
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -80,8 +65,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec)
* Set dentry and possibly attribute timeouts from the lookup/mk*
* replies
*/
-static void fuse_change_entry_timeout(struct dentry *entry,
- struct fuse_entry_out *o)
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o)
{
fuse_dentry_settime(entry,
time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
@@ -92,18 +76,29 @@ static u64 attr_timeout(struct fuse_attr_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
-static u64 entry_attr_timeout(struct fuse_entry_out *o)
+u64 entry_attr_timeout(struct fuse_entry_out *o)
{
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
+static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+{
+ set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
+}
+
/*
* Mark the attributes as stale, so that at the next call to
* ->getattr() they will be fetched from userspace
*/
void fuse_invalidate_attr(struct inode *inode)
{
- get_fuse_inode(inode)->i_time = 0;
+ fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS);
+}
+
+static void fuse_dir_changed(struct inode *dir)
+{
+ fuse_invalidate_attr(dir);
+ inode_maybe_inc_iversion(dir, false);
}
/**
@@ -113,7 +108,7 @@ void fuse_invalidate_attr(struct inode *inode)
void fuse_invalidate_atime(struct inode *inode)
{
if (!IS_RDONLY(inode))
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, STATX_ATIME);
}
/*
@@ -262,11 +257,6 @@ invalid:
goto out;
}
-static int invalid_nodeid(u64 nodeid)
-{
- return !nodeid || nodeid == FUSE_ROOT_ID;
-}
-
static int fuse_dentry_init(struct dentry *dentry)
{
dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
@@ -469,7 +459,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
kfree(forget);
d_instantiate(entry, inode);
fuse_change_entry_timeout(entry, &outentry);
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
err = finish_open(file, entry, generic_file_open);
if (err) {
fuse_sync_release(ff, flags);
@@ -583,7 +573,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
} else {
fuse_change_entry_timeout(entry, &outarg);
}
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
return 0;
out_put_forget_req:
@@ -693,7 +683,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
drop_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
fuse_update_ctime(inode);
} else if (err == -EINTR)
@@ -715,7 +705,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
err = fuse_simple_request(fc, &args);
if (!err) {
clear_nlink(d_inode(entry));
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
@@ -754,9 +744,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
fuse_update_ctime(d_inode(newent));
}
- fuse_invalidate_attr(olddir);
+ fuse_dir_changed(olddir);
if (olddir != newdir)
- fuse_invalidate_attr(newdir);
+ fuse_dir_changed(newdir);
/* newent will end up negative */
if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
@@ -932,7 +922,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
}
static int fuse_update_get_attr(struct inode *inode, struct file *file,
- struct kstat *stat, unsigned int flags)
+ struct kstat *stat, u32 request_mask,
+ unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
@@ -942,6 +933,8 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
+ else if (request_mask & READ_ONCE(fi->inval_mask))
+ sync = true;
else
sync = time_before64(fi->i_time, get_jiffies_64());
@@ -959,7 +952,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
int fuse_update_attributes(struct inode *inode, struct file *file)
{
- return fuse_update_get_attr(inode, file, NULL, 0);
+ /* Do *not* need to get atime for internal purposes */
+ return fuse_update_get_attr(inode, file, NULL,
+ STATX_BASIC_STATS & ~STATX_ATIME, 0);
}
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
@@ -989,7 +984,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!entry)
goto unlock;
- fuse_invalidate_attr(parent);
+ fuse_dir_changed(parent);
fuse_invalidate_entry(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
@@ -1124,8 +1119,10 @@ static int fuse_permission(struct inode *inode, int mask)
if (fc->default_permissions ||
((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
struct fuse_inode *fi = get_fuse_inode(inode);
+ u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID;
- if (time_before64(fi->i_time, get_jiffies_64())) {
+ if (perm_mask & READ_ONCE(fi->inval_mask) ||
+ time_before64(fi->i_time, get_jiffies_64())) {
refreshed = true;
err = fuse_perm_getattr(inode, mask);
@@ -1165,271 +1162,78 @@ static int fuse_permission(struct inode *inode, int mask)
return err;
}
-static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
- struct dir_context *ctx)
-{
- while (nbytes >= FUSE_NAME_OFFSET) {
- struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
- size_t reclen = FUSE_DIRENT_SIZE(dirent);
- if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
- return -EIO;
- if (reclen > nbytes)
- break;
- if (memchr(dirent->name, '/', dirent->namelen) != NULL)
- return -EIO;
-
- if (!dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type))
- break;
-
- buf += reclen;
- nbytes -= reclen;
- ctx->pos = dirent->off;
- }
-
- return 0;
-}
-
-static int fuse_direntplus_link(struct file *file,
- struct fuse_direntplus *direntplus,
- u64 attr_version)
-{
- struct fuse_entry_out *o = &direntplus->entry_out;
- struct fuse_dirent *dirent = &direntplus->dirent;
- struct dentry *parent = file->f_path.dentry;
- struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
- struct dentry *dentry;
- struct dentry *alias;
- struct inode *dir = d_inode(parent);
- struct fuse_conn *fc;
- struct inode *inode;
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-
- if (!o->nodeid) {
- /*
- * Unlike in the case of fuse_lookup, zero nodeid does not mean
- * ENOENT. Instead, it only means the userspace filesystem did
- * not want to return attributes/handle for this entry.
- *
- * So do nothing.
- */
- return 0;
- }
-
- if (name.name[0] == '.') {
- /*
- * We could potentially refresh the attributes of the directory
- * and its parent?
- */
- if (name.len == 1)
- return 0;
- if (name.name[1] == '.' && name.len == 2)
- return 0;
- }
-
- if (invalid_nodeid(o->nodeid))
- return -EIO;
- if (!fuse_valid_type(o->attr.mode))
- return -EIO;
-
- fc = get_fuse_conn(dir);
-
- name.hash = full_name_hash(parent, name.name, name.len);
- dentry = d_lookup(parent, &name);
- if (!dentry) {
-retry:
- dentry = d_alloc_parallel(parent, &name, &wq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- }
- if (!d_in_lookup(dentry)) {
- struct fuse_inode *fi;
- inode = d_inode(dentry);
- if (!inode ||
- get_node_id(inode) != o->nodeid ||
- ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
- d_invalidate(dentry);
- dput(dentry);
- goto retry;
- }
- if (is_bad_inode(inode)) {
- dput(dentry);
- return -EIO;
- }
-
- fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->nlookup++;
- spin_unlock(&fc->lock);
-
- forget_all_cached_acls(inode);
- fuse_change_attributes(inode, &o->attr,
- entry_attr_timeout(o),
- attr_version);
- /*
- * The other branch comes via fuse_iget()
- * which bumps nlookup inside
- */
- } else {
- inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
- &o->attr, entry_attr_timeout(o),
- attr_version);
- if (!inode)
- inode = ERR_PTR(-ENOMEM);
-
- alias = d_splice_alias(inode, dentry);
- d_lookup_done(dentry);
- if (alias) {
- dput(dentry);
- dentry = alias;
- }
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- }
- if (fc->readdirplus_auto)
- set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
- fuse_change_entry_timeout(dentry, o);
-
- dput(dentry);
- return 0;
-}
-
-static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
- struct dir_context *ctx, u64 attr_version)
-{
- struct fuse_direntplus *direntplus;
- struct fuse_dirent *dirent;
- size_t reclen;
- int over = 0;
- int ret;
-
- while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
- direntplus = (struct fuse_direntplus *) buf;
- dirent = &direntplus->dirent;
- reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
-
- if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
- return -EIO;
- if (reclen > nbytes)
- break;
- if (memchr(dirent->name, '/', dirent->namelen) != NULL)
- return -EIO;
-
- if (!over) {
- /* We fill entries into dstbuf only as much as
- it can hold. But we still continue iterating
- over remaining entries to link them. If not,
- we need to send a FORGET for each of those
- which we did not link.
- */
- over = !dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type);
- if (!over)
- ctx->pos = dirent->off;
- }
-
- buf += reclen;
- nbytes -= reclen;
-
- ret = fuse_direntplus_link(file, direntplus, attr_version);
- if (ret)
- fuse_force_forget(file, direntplus->entry_out.nodeid);
- }
-
- return 0;
-}
-
-static int fuse_readdir(struct file *file, struct dir_context *ctx)
+static int fuse_readlink_page(struct inode *inode, struct page *page)
{
- int plus, err;
- size_t nbytes;
- struct page *page;
- struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
- u64 attr_version = 0;
- bool locked;
-
- if (is_bad_inode(inode))
- return -EIO;
+ int err;
req = fuse_get_req(fc, 1);
if (IS_ERR(req))
return PTR_ERR(req);
- page = alloc_page(GFP_KERNEL);
- if (!page) {
- fuse_put_request(fc, req);
- return -ENOMEM;
- }
-
- plus = fuse_use_readdirplus(inode, ctx);
+ req->out.page_zeroing = 1;
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
- req->page_descs[0].length = PAGE_SIZE;
- if (plus) {
- attr_version = fuse_get_attr_version(fc);
- fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIRPLUS);
- } else {
- fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIR);
- }
- locked = fuse_lock_inode(inode);
+ req->page_descs[0].length = PAGE_SIZE - 1;
+ req->in.h.opcode = FUSE_READLINK;
+ req->in.h.nodeid = get_node_id(inode);
+ req->out.argvar = 1;
+ req->out.numargs = 1;
+ req->out.args[0].size = PAGE_SIZE - 1;
fuse_request_send(fc, req);
- fuse_unlock_inode(inode, locked);
- nbytes = req->out.args[0].size;
err = req->out.h.error;
- fuse_put_request(fc, req);
+
if (!err) {
- if (plus) {
- err = parse_dirplusfile(page_address(page), nbytes,
- file, ctx,
- attr_version);
- } else {
- err = parse_dirfile(page_address(page), nbytes, file,
- ctx);
- }
+ char *link = page_address(page);
+ size_t len = req->out.args[0].size;
+
+ BUG_ON(len >= PAGE_SIZE);
+ link[len] = '\0';
}
- __free_page(page);
+ fuse_put_request(fc, req);
fuse_invalidate_atime(inode);
+
return err;
}
-static const char *fuse_get_link(struct dentry *dentry,
- struct inode *inode,
- struct delayed_call *done)
+static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
struct fuse_conn *fc = get_fuse_conn(inode);
- FUSE_ARGS(args);
- char *link;
- ssize_t ret;
+ struct page *page;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out_err;
+
+ if (fc->cache_symlinks)
+ return page_get_link(dentry, inode, callback);
+ err = -ECHILD;
if (!dentry)
- return ERR_PTR(-ECHILD);
+ goto out_err;
- link = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!link)
- return ERR_PTR(-ENOMEM);
+ page = alloc_page(GFP_KERNEL);
+ err = -ENOMEM;
+ if (!page)
+ goto out_err;
- args.in.h.opcode = FUSE_READLINK;
- args.in.h.nodeid = get_node_id(inode);
- args.out.argvar = 1;
- args.out.numargs = 1;
- args.out.args[0].size = PAGE_SIZE - 1;
- args.out.args[0].value = link;
- ret = fuse_simple_request(fc, &args);
- if (ret < 0) {
- kfree(link);
- link = ERR_PTR(ret);
- } else {
- link[ret] = '\0';
- set_delayed_call(done, kfree_link, link);
+ err = fuse_readlink_page(inode, page);
+ if (err) {
+ __free_page(page);
+ goto out_err;
}
- fuse_invalidate_atime(inode);
- return link;
+
+ set_delayed_call(callback, page_put_link, page);
+
+ return page_address(page);
+
+out_err:
+ return ERR_PTR(err);
}
static int fuse_dir_open(struct inode *inode, struct file *file)
@@ -1439,7 +1243,7 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
static int fuse_dir_release(struct inode *inode, struct file *file)
{
- fuse_release_common(file, FUSE_RELEASEDIR);
+ fuse_release_common(file, true);
return 0;
}
@@ -1447,7 +1251,25 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
- return fuse_fsync_common(file, start, end, datasync, 1);
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ if (fc->no_fsyncdir)
+ return 0;
+
+ inode_lock(inode);
+ err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR);
+ if (err == -ENOSYS) {
+ fc->no_fsyncdir = 1;
+ err = 0;
+ }
+ inode_unlock(inode);
+
+ return err;
}
static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
@@ -1662,8 +1484,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
file = NULL;
}
- if (attr->ia_valid & ATTR_SIZE)
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
is_truncate = true;
+ }
if (is_truncate) {
fuse_set_nowrite(inode);
@@ -1811,7 +1636,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
if (!fuse_allow_current_process(fc))
return -EACCES;
- return fuse_update_get_attr(inode, NULL, stat, flags);
+ return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
}
static const struct inode_operations fuse_dir_inode_operations = {
@@ -1867,11 +1692,37 @@ void fuse_init_common(struct inode *inode)
void fuse_init_dir(struct inode *inode)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
inode->i_op = &fuse_dir_inode_operations;
inode->i_fop = &fuse_dir_operations;
+
+ spin_lock_init(&fi->rdc.lock);
+ fi->rdc.cached = false;
+ fi->rdc.size = 0;
+ fi->rdc.pos = 0;
+ fi->rdc.version = 0;
+}
+
+static int fuse_symlink_readpage(struct file *null, struct page *page)
+{
+ int err = fuse_readlink_page(page->mapping->host, page);
+
+ if (!err)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+
+ return err;
}
+static const struct address_space_operations fuse_symlink_aops = {
+ .readpage = fuse_symlink_readpage,
+};
+
void fuse_init_symlink(struct inode *inode)
{
inode->i_op = &fuse_symlink_inode_operations;
+ inode->i_data.a_ops = &fuse_symlink_aops;
+ inode_nohighmem(inode);
}
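Earlier in this file, fuse_dir_changed() pairs fuse_invalidate_attr() with inode_maybe_inc_iversion(), giving the new readdir cache a cheap validity check: a cached stream records the directory's i_version when it starts and compares before reuse. A hypothetical sketch of that check, using the helpers from <linux/iversion.h>:

static bool rdc_still_valid(struct inode *dir, u64 cached_iversion)
{
	/* any create/unlink/rename through this client bumped i_version */
	return inode_peek_iversion(dir) == cached_iversion;
}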
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 32d0b883e74f..a59c16bd90ac 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -59,6 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
}
INIT_LIST_HEAD(&ff->write_entry);
+ mutex_init(&ff->readdir.lock);
refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -73,6 +74,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
void fuse_file_free(struct fuse_file *ff)
{
fuse_request_free(ff->reserved_req);
+ mutex_destroy(&ff->readdir.lock);
kfree(ff);
}
@@ -87,12 +89,12 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
iput(req->misc.release.inode);
}
-static void fuse_file_put(struct fuse_file *ff, bool sync)
+static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- if (ff->fc->no_open) {
+ if (ff->fc->no_open && !isdir) {
/*
* Drop the release request when client does not
* implement 'open'
@@ -245,10 +247,11 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
req->in.args[0].value = inarg;
}
-void fuse_release_common(struct file *file, int opcode)
+void fuse_release_common(struct file *file, bool isdir)
{
struct fuse_file *ff = file->private_data;
struct fuse_req *req = ff->reserved_req;
+ int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
fuse_prepare_release(ff, file->f_flags, opcode);
@@ -270,7 +273,7 @@ void fuse_release_common(struct file *file, int opcode)
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fc->destroy_req != NULL);
+ fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
@@ -286,7 +289,7 @@ static int fuse_release(struct inode *inode, struct file *file)
if (fc->writeback_cache)
write_inode_now(inode, 1);
- fuse_release_common(file, FUSE_RELEASE);
+ fuse_release_common(file, false);
/* return value is ignored by VFS */
return 0;
@@ -300,7 +303,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
* iput(NULL) is a no-op and since the refcount is 1 and everything's
* synchronous, we are fine with not doing igrab() here"
*/
- fuse_file_put(ff, true);
+ fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
@@ -439,13 +442,30 @@ static int fuse_flush(struct file *file, fl_owner_t id)
}
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
- int datasync, int isdir)
+ int datasync, int opcode)
{
struct inode *inode = file->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_fsync_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.fsync_flags = datasync ? 1 : 0;
+ args.in.h.opcode = opcode;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ return fuse_simple_request(fc, &args);
+}
+
+static int fuse_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
int err;
if (is_bad_inode(inode))
@@ -477,34 +497,18 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
if (err)
goto out;
- if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+ if (fc->no_fsync)
goto out;
- memset(&inarg, 0, sizeof(inarg));
- inarg.fh = ff->fh;
- inarg.fsync_flags = datasync ? 1 : 0;
- args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
- args.in.h.nodeid = get_node_id(inode);
- args.in.numargs = 1;
- args.in.args[0].size = sizeof(inarg);
- args.in.args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
if (err == -ENOSYS) {
- if (isdir)
- fc->no_fsyncdir = 1;
- else
- fc->no_fsync = 1;
+ fc->no_fsync = 1;
err = 0;
}
out:
inode_unlock(inode);
- return err;
-}
-static int fuse_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- return fuse_fsync_common(file, start, end, datasync, 0);
+ return err;
}
void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -805,7 +809,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
put_page(page);
}
if (req->ff)
- fuse_file_put(req->ff, false);
+ fuse_file_put(req->ff, false, false);
}
static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -848,11 +852,11 @@ static int fuse_readpages_fill(void *_data, struct page *page)
fuse_wait_on_page_writeback(inode, page->index);
if (req->num_pages &&
- (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (req->num_pages == fc->max_pages ||
(req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
- int nr_alloc = min_t(unsigned, data->nr_pages,
- FUSE_MAX_PAGES_PER_REQ);
+ unsigned int nr_alloc = min_t(unsigned int, data->nr_pages,
+ fc->max_pages);
fuse_send_readpages(req, data->file);
if (fc->async_read)
req = fuse_get_req_for_background(fc, nr_alloc);
@@ -887,7 +891,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_data data;
int err;
- int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
+ unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages);
err = -EIO;
if (is_bad_inode(inode))
@@ -1102,12 +1106,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
return count > 0 ? count : err;
}
-static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
+ unsigned int max_pages)
{
- return min_t(unsigned,
+ return min_t(unsigned int,
((pos + len - 1) >> PAGE_SHIFT) -
(pos >> PAGE_SHIFT) + 1,
- FUSE_MAX_PAGES_PER_REQ);
+ max_pages);
}
static ssize_t fuse_perform_write(struct kiocb *iocb,
@@ -1129,7 +1134,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
do {
struct fuse_req *req;
ssize_t count;
- unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
+ unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
+ fc->max_pages);
req = fuse_get_req(fc, nr_pages);
if (IS_ERR(req)) {
@@ -1271,7 +1277,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
ssize_t ret = 0;
/* Special case for kernel I/O: can copy directly into the buffer */
- if (ii->type & ITER_KVEC) {
+ if (iov_iter_is_kvec(ii)) {
unsigned long user_addr = fuse_get_user_addr(ii);
size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
@@ -1319,11 +1325,6 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
return ret < 0 ? ret : 0;
}
-static inline int fuse_iter_npages(const struct iov_iter *ii_p)
-{
- return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
-}
-
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
loff_t *ppos, int flags)
{
@@ -1343,9 +1344,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
int err = 0;
if (io->async)
- req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
+ req = fuse_get_req_for_background(fc, iov_iter_npages(iter,
+ fc->max_pages));
else
- req = fuse_get_req(fc, fuse_iter_npages(iter));
+ req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1390,9 +1392,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
fuse_put_request(fc, req);
if (io->async)
req = fuse_get_req_for_background(fc,
- fuse_iter_npages(iter));
+ iov_iter_npages(iter, fc->max_pages));
else
- req = fuse_get_req(fc, fuse_iter_npages(iter));
+ req = fuse_get_req(fc, iov_iter_npages(iter,
+ fc->max_pages));
if (IS_ERR(req))
break;
}
@@ -1418,7 +1421,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
res = fuse_direct_io(io, iter, ppos, 0);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_atime(inode);
return res;
}
@@ -1459,7 +1462,7 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
__free_page(req->pages[i]);
if (req->ff)
- fuse_file_put(req->ff, false);
+ fuse_file_put(req->ff, false, false);
}
static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1487,6 +1490,7 @@ __acquires(fc->lock)
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_SIZE;
+ bool queued;
if (!fc->connected)
goto out_free;
@@ -1502,7 +1506,8 @@ __acquires(fc->lock)
req->in.args[1].size = inarg->size;
fi->writectr++;
- fuse_request_send_background_locked(fc, req);
+ queued = fuse_request_queue_background(fc, req);
+ WARN_ON(!queued);
return;
out_free:
@@ -1616,7 +1621,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
ff = __fuse_write_file_get(fc, fi);
err = fuse_flush_times(inode, ff);
if (ff)
- fuse_file_put(ff, 0);
+ fuse_file_put(ff, false, false);
return err;
}
@@ -1777,7 +1782,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
spin_unlock(&fc->lock);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(page, NR_WRITEBACK_TEMP);
+ dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
@@ -1819,12 +1824,18 @@ static int fuse_writepages_fill(struct page *page,
is_writeback = fuse_page_is_writeback(inode, page->index);
if (req && req->num_pages &&
- (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (is_writeback || req->num_pages == fc->max_pages ||
(req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
fuse_writepages_send(data);
data->req = NULL;
+ } else if (req && req->num_pages == req->max_pages) {
+ if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) {
+ fuse_writepages_send(data);
+ req = data->req = NULL;
+ }
}
+
err = -ENOMEM;
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
@@ -1847,7 +1858,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
err = -ENOMEM;
- req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+ req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES);
if (!req) {
__free_page(tmp_page);
goto out_unlock;
@@ -1904,6 +1915,7 @@ static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_wb_data data;
int err;
@@ -1916,7 +1928,7 @@ static int fuse_writepages(struct address_space *mapping,
data.ff = NULL;
err = -ENOMEM;
- data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+ data.orig_pages = kcalloc(fc->max_pages,
sizeof(struct page *),
GFP_NOFS);
if (!data.orig_pages)
@@ -1930,7 +1942,7 @@ static int fuse_writepages(struct address_space *mapping,
err = 0;
}
if (data.ff)
- fuse_file_put(data.ff, false);
+ fuse_file_put(data.ff, false, false);
kfree(data.orig_pages);
out:
@@ -2387,10 +2399,11 @@ static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
}
/* Make sure iov_length() won't overflow */
-static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
+ size_t count)
{
size_t n;
- u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+ u32 max = fc->max_pages << PAGE_SHIFT;
for (n = 0; n < count; n++, iov++) {
if (iov->iov_len > (size_t) max)
@@ -2514,7 +2527,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
+ pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!pages || !iov_page)
goto out;
@@ -2553,7 +2566,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
/* make sure there are enough buffer pages and init request with them */
err = -ENOMEM;
- if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+ if (max_pages > fc->max_pages)
goto out;
while (num_pages < max_pages) {
pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2640,11 +2653,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iov = iov_page;
out_iov = in_iov + in_iovs;
- err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+ err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
if (err)
goto out;
- err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+ err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
if (err)
goto out;
@@ -2835,9 +2848,9 @@ static void fuse_do_truncate(struct file *file)
fuse_do_setattr(file_dentry(file), &attr, file);
}
-static inline loff_t fuse_round_up(loff_t off)
+static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
- return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+ return round_up(off, fc->max_pages << PAGE_SHIFT);
}
static ssize_t
@@ -2866,7 +2879,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
if (offset >= i_size)
return 0;
- iov_iter_truncate(iter, fuse_round_up(i_size - offset));
+ iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
count = iov_iter_count(iter);
}
@@ -2913,10 +2926,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
if (io->async) {
+ bool blocking = io->blocking;
+
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (!io->blocking)
+ if (!blocking)
return -EIOCBQUEUED;
wait_for_completion(&wait);
@@ -3011,6 +3026,82 @@ out:
return err;
}
+static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct fuse_file *ff_in = file_in->private_data;
+ struct fuse_file *ff_out = file_out->private_data;
+ struct inode *inode_out = file_inode(file_out);
+ struct fuse_inode *fi_out = get_fuse_inode(inode_out);
+ struct fuse_conn *fc = ff_in->fc;
+ FUSE_ARGS(args);
+ struct fuse_copy_file_range_in inarg = {
+ .fh_in = ff_in->fh,
+ .off_in = pos_in,
+ .nodeid_out = ff_out->nodeid,
+ .fh_out = ff_out->fh,
+ .off_out = pos_out,
+ .len = len,
+ .flags = flags
+ };
+ struct fuse_write_out outarg;
+ ssize_t err;
+ /* mark unstable when write-back is not used, and file_out gets
+ * extended */
+ bool is_unstable = (!fc->writeback_cache) &&
+ ((pos_out + len) > inode_out->i_size);
+
+ if (fc->no_copy_file_range)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode_out);
+
+ if (fc->writeback_cache) {
+ err = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len);
+ if (err)
+ goto out;
+
+ fuse_sync_writes(inode_out);
+ }
+
+ if (is_unstable)
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+ args.in.h.opcode = FUSE_COPY_FILE_RANGE;
+ args.in.h.nodeid = ff_in->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_copy_file_range = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (err)
+ goto out;
+
+ if (fc->writeback_cache) {
+ fuse_write_update_size(inode_out, pos_out + outarg.size);
+ file_update_time(file_out);
+ }
+
+ fuse_invalidate_attr(inode_out);
+
+ err = outarg.size;
+out:
+ if (is_unstable)
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+ inode_unlock(inode_out);
+
+ return err;
+}
+
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read_iter = fuse_file_read_iter,
@@ -3027,6 +3118,7 @@ static const struct file_operations fuse_file_operations = {
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
.fallocate = fuse_file_fallocate,
+ .copy_file_range = fuse_copy_file_range,
};
static const struct file_operations fuse_direct_io_file_operations = {
@@ -3062,6 +3154,14 @@ static const struct address_space_operations fuse_file_aops = {
void fuse_init_file_inode(struct inode *inode)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
inode->i_fop = &fuse_file_operations;
inode->i_data.a_ops = &fuse_file_aops;
+
+ INIT_LIST_HEAD(&fi->write_files);
+ INIT_LIST_HEAD(&fi->queued_writes);
+ fi->writectr = 0;
+ init_waitqueue_head(&fi->page_waitq);
+ INIT_LIST_HEAD(&fi->writepages);
}
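With .copy_file_range wired into fuse_file_operations above, a plain copy_file_range(2) on a FUSE mount can be satisfied server-side without the data round-tripping through the client. Userspace sketch (paths abbreviated, error handling minimal; clone_range() is a hypothetical wrapper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t clone_range(const char *src, const char *dst, size_t len)
{
	int in = open(src, O_RDONLY);
	int out = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	ssize_t n = -1;

	if (in >= 0 && out >= 0)	/* one syscall -> FUSE_COPY_FILE_RANGE */
		n = copy_file_range(in, NULL, out, NULL, len, 0);
	if (in >= 0)
		close(in);
	if (out >= 0)
		close(out);
	return n;
}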
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f78e9614bb5f..2f2c92e6f8cb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -28,8 +28,11 @@
#include <linux/refcount.h>
#include <linux/user_namespace.h>
-/** Max number of pages that can be used in a single read request */
-#define FUSE_MAX_PAGES_PER_REQ 32
+/** Default max number of pages that can be used in a single read request */
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/** Maximum of max_pages received in init_out */
+#define FUSE_MAX_MAX_PAGES 256
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
@@ -77,6 +80,9 @@ struct fuse_inode {
/** Time in jiffies until the file attributes are valid */
u64 i_time;
+ /* Which attributes are invalid */
+ u32 inval_mask;
+
/** The sticky bit in inode->i_mode may have been removed, so
preserve the original mode */
umode_t orig_i_mode;
@@ -87,21 +93,51 @@ struct fuse_inode {
/** Version of last attribute change */
u64 attr_version;
- /** Files usable in writepage. Protected by fc->lock */
- struct list_head write_files;
+ union {
+ /* Write related fields (regular file only) */
+ struct {
+ /* Files usable in writepage. Protected by fc->lock */
+ struct list_head write_files;
+
+ /* Writepages pending on truncate or fsync */
+ struct list_head queued_writes;
- /** Writepages pending on truncate or fsync */
- struct list_head queued_writes;
+ /* Number of sent writes, a negative bias
+ * (FUSE_NOWRITE) means more writes are blocked */
+ int writectr;
+
+ /* Waitq for writepage completion */
+ wait_queue_head_t page_waitq;
+
+ /* List of writepage requests (pending or sent) */
+ struct list_head writepages;
+ };
+
+ /* readdir cache (directory only) */
+ struct {
+ /* true if fully cached */
+ bool cached;
- /** Number of sent writes, a negative bias (FUSE_NOWRITE)
- * means more writes are blocked */
- int writectr;
+ /* size of cache */
+ loff_t size;
- /** Waitq for writepage completion */
- wait_queue_head_t page_waitq;
+ /* position at end of cache (position of next entry) */
+ loff_t pos;
- /** List of writepage requestst (pending or sent) */
- struct list_head writepages;
+ /* version of the cache */
+ u64 version;
+
+ /* modification time of directory when cache was
+ * started */
+ struct timespec64 mtime;
+
+ /* iversion of directory when cache was started */
+ u64 iversion;
+
+ /* protects above fields */
+ spinlock_t lock;
+ } rdc;
+ };
/** Miscellaneous bits describing inode state */
unsigned long state;
@@ -148,6 +184,25 @@ struct fuse_file {
/** Entry on inode's write_files list */
struct list_head write_entry;
+ /* Readdir related */
+ struct {
+ /*
+ * Protects below fields against (crazy) parallel readdir on
+ * same open file. Uncontended in the normal case.
+ */
+ struct mutex lock;
+
+ /* Dir stream position */
+ loff_t pos;
+
+ /* Offset in cache */
+ loff_t cache_off;
+
+ /* Version of cache we are reading */
+ u64 version;
+
+ } readdir;
+
/** RB node to be linked on fuse_conn->polled_files */
struct rb_node polled_node;
@@ -311,9 +366,6 @@ struct fuse_req {
/** refcount */
refcount_t count;
- /** Unique ID for the interrupt request */
- u64 intr_unique;
-
/* Request flags, updated with test/set/clear_bit() */
unsigned long flags;
@@ -411,6 +463,9 @@ struct fuse_iqueue {
struct fasync_struct *fasync;
};
+#define FUSE_PQ_HASH_BITS 8
+#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS)
+
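The fuse_dev side of this series distributes requests over these buckets by hashing the request's unique ID; a sketch of that helper (fuse_req_hash() in fs/fuse/dev.c, reproduced from memory rather than from this excerpt):

	static unsigned int fuse_req_hash(u64 unique)
	{
		/* Interrupt requests share the low bit of the unique ID */
		return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
	}
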
struct fuse_pqueue {
/** Connection established */
unsigned connected;
@@ -418,8 +473,8 @@ struct fuse_pqueue {
/** Lock protecting accesses to members of this structure */
spinlock_t lock;
- /** The list of requests being processed */
- struct list_head processing;
+ /** Hash table of requests being processed */
+ struct list_head *processing;
/** The list of requests under I/O */
struct list_head io;
@@ -476,6 +531,9 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
+ /** Maximum number of pages that can be used in a single request */
+ unsigned int max_pages;
+
/** Input queue */
struct fuse_iqueue iq;
@@ -500,6 +558,10 @@ struct fuse_conn {
/** The list of background requests set aside for later queuing */
struct list_head bg_queue;
+ /** Protects: max_background, congestion_threshold, num_background,
+ * active_background, bg_queue, blocked */
+ spinlock_t bg_lock;
+
/** Flag indicating that INIT reply has been received. Allocating
* any fuse request will be suspended until the flag is set */
int initialized;
@@ -551,6 +613,9 @@ struct fuse_conn {
/** handle fs handles killing suid/sgid/cap on write/chown/trunc */
unsigned handle_killpriv:1;
+ /** cache READLINK responses in page cache */
+ unsigned cache_symlinks:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -637,6 +702,9 @@ struct fuse_conn {
/** Allow other than the mounter user to access the filesystem ? */
unsigned allow_other:1;
+ /** Does the filesystem support copy_file_range? */
+ unsigned no_copy_file_range:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -697,6 +765,11 @@ static inline u64 get_node_id(struct inode *inode)
return get_fuse_inode(inode)->nodeid;
}
+static inline int invalid_nodeid(u64 nodeid)
+{
+ return !nodeid || nodeid == FUSE_ROOT_ID;
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -749,13 +822,13 @@ void fuse_sync_release(struct fuse_file *ff, int flags);
/**
* Send RELEASE or RELEASEDIR request
*/
-void fuse_release_common(struct file *file, int opcode);
+void fuse_release_common(struct file *file, bool isdir);
/**
* Send FSYNC or FSYNCDIR request
*/
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
- int datasync, int isdir);
+ int datasync, int opcode);
/**
* Notify poll wakeup
@@ -812,6 +885,10 @@ struct fuse_req *fuse_request_alloc(unsigned npages);
struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
+ gfp_t flags);
+
/**
* Free a request
*/
@@ -856,9 +933,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
* Send a request in the background
*/
void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-
-void fuse_request_send_background_locked(struct fuse_conn *fc,
- struct fuse_req *req);
+bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
@@ -873,6 +948,9 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
void fuse_invalidate_atime(struct inode *inode);
+u64 entry_attr_timeout(struct fuse_entry_out *o);
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
+
/**
* Acquire reference to fuse_conn
*/
@@ -992,4 +1070,8 @@ struct posix_acl;
struct posix_acl *fuse_get_acl(struct inode *inode, int type);
int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
+/* readdir.c */
+int fuse_readdir(struct file *file, struct dir_context *ctx);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index db9e60b7eb69..c2d4099429be 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -90,16 +90,12 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi = get_fuse_inode(inode);
fi->i_time = 0;
+ fi->inval_mask = 0;
fi->nodeid = 0;
fi->nlookup = 0;
fi->attr_version = 0;
- fi->writectr = 0;
fi->orig_ino = 0;
fi->state = 0;
- INIT_LIST_HEAD(&fi->write_files);
- INIT_LIST_HEAD(&fi->queued_writes);
- INIT_LIST_HEAD(&fi->writepages);
- init_waitqueue_head(&fi->page_waitq);
mutex_init(&fi->mutex);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
@@ -119,8 +115,10 @@ static void fuse_i_callback(struct rcu_head *head)
static void fuse_destroy_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
- BUG_ON(!list_empty(&fi->write_files));
- BUG_ON(!list_empty(&fi->queued_writes));
+ if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
+ WARN_ON(!list_empty(&fi->write_files));
+ WARN_ON(!list_empty(&fi->queued_writes));
+ }
mutex_destroy(&fi->mutex);
kfree(fi->forget);
call_rcu(&inode->i_rcu, fuse_i_callback);
@@ -167,6 +165,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
fi->attr_version = ++fc->attr_version;
fi->i_time = attr_valid;
+ WRITE_ONCE(fi->inval_mask, 0);
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -594,9 +593,11 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq)
static void fuse_pqueue_init(struct fuse_pqueue *fpq)
{
- memset(fpq, 0, sizeof(struct fuse_pqueue));
+ unsigned int i;
+
spin_lock_init(&fpq->lock);
- INIT_LIST_HEAD(&fpq->processing);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&fpq->processing[i]);
INIT_LIST_HEAD(&fpq->io);
fpq->connected = 1;
}
@@ -605,6 +606,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
{
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
+ spin_lock_init(&fc->bg_lock);
init_rwsem(&fc->killsb);
refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
@@ -626,6 +628,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
+ fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -822,7 +825,7 @@ static const struct super_operations fuse_super_operations = {
static void sanitize_global_limit(unsigned *limit)
{
if (*limit == 0)
- *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
+ *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) /
sizeof(struct fuse_req);
if (*limit >= 1 << 16)
@@ -852,6 +855,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
sanitize_global_limit(&max_user_bgreq);
sanitize_global_limit(&max_user_congthresh);
+ spin_lock(&fc->bg_lock);
if (arg->max_background) {
fc->max_background = arg->max_background;
@@ -865,6 +869,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
fc->congestion_threshold > max_user_congthresh)
fc->congestion_threshold = max_user_congthresh;
}
+ spin_unlock(&fc->bg_lock);
}
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
@@ -924,8 +929,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->posix_acl = 1;
fc->sb->s_xattr = fuse_acl_xattr_handlers;
}
+ if (arg->flags & FUSE_CACHE_SYMLINKS)
+ fc->cache_symlinks = 1;
if (arg->flags & FUSE_ABORT_ERROR)
fc->abort_err = 1;
+ if (arg->flags & FUSE_MAX_PAGES) {
+ fc->max_pages =
+ min_t(unsigned int, FUSE_MAX_MAX_PAGES,
+ max_t(unsigned int, arg->max_pages, 1));
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -957,7 +969,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
- FUSE_ABORT_ERROR;
+ FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1022,17 +1034,26 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
{
struct fuse_dev *fud;
+ struct list_head *pq;
fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
- if (fud) {
- fud->fc = fuse_conn_get(fc);
- fuse_pqueue_init(&fud->pq);
+ if (!fud)
+ return NULL;
- spin_lock(&fc->lock);
- list_add_tail(&fud->entry, &fc->devices);
- spin_unlock(&fc->lock);
+ pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+ if (!pq) {
+ kfree(fud);
+ return NULL;
}
+ fud->pq.processing = pq;
+ fud->fc = fuse_conn_get(fc);
+ fuse_pqueue_init(&fud->pq);
+
+ spin_lock(&fc->lock);
+ list_add_tail(&fud->entry, &fc->devices);
+ spin_unlock(&fc->lock);
+
return fud;
}
EXPORT_SYMBOL_GPL(fuse_dev_alloc);
@@ -1048,6 +1069,7 @@ void fuse_dev_free(struct fuse_dev *fud)
fuse_conn_put(fc);
}
+ kfree(fud->pq.processing);
kfree(fud);
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
new file mode 100644
index 000000000000..ab18b78f4755
--- /dev/null
+++ b/fs/fuse/readdir.c
@@ -0,0 +1,569 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+
+#include "fuse_i.h"
+#include <linux/iversion.h>
+#include <linux/posix_acl.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ if (!fc->do_readdirplus)
+ return false;
+ if (!fc->readdirplus_auto)
+ return true;
+ if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+ return true;
+ if (ctx->pos == 0)
+ return true;
+ return false;
+}
+
+static void fuse_add_dirent_to_cache(struct file *file,
+ struct fuse_dirent *dirent, loff_t pos)
+{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ pgoff_t index;
+ struct page *page;
+ loff_t size;
+ u64 version;
+ unsigned int offset;
+ void *addr;
+
+ spin_lock(&fi->rdc.lock);
+ /*
+  * Is the cache already complete, or does this entry not belong at
+  * the current end of the cache?
+  */
+ if (fi->rdc.cached || pos != fi->rdc.pos) {
+ spin_unlock(&fi->rdc.lock);
+ return;
+ }
+ version = fi->rdc.version;
+ size = fi->rdc.size;
+ offset = size & ~PAGE_MASK;
+ index = size >> PAGE_SHIFT;
+ /* Dirent doesn't fit in current page? Jump to next page. */
+ if (offset + reclen > PAGE_SIZE) {
+ index++;
+ offset = 0;
+ }
+ spin_unlock(&fi->rdc.lock);
+
+ if (offset) {
+ page = find_lock_page(file->f_mapping, index);
+ } else {
+ page = find_or_create_page(file->f_mapping, index,
+ mapping_gfp_mask(file->f_mapping));
+ }
+ if (!page)
+ return;
+
+ spin_lock(&fi->rdc.lock);
+ /* Raced with another readdir */
+ if (fi->rdc.version != version || fi->rdc.size != size ||
+ WARN_ON(fi->rdc.pos != pos))
+ goto unlock;
+
+ addr = kmap_atomic(page);
+ if (!offset)
+ clear_page(addr);
+ memcpy(addr + offset, dirent, reclen);
+ kunmap_atomic(addr);
+ fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
+ fi->rdc.pos = dirent->off;
+unlock:
+ spin_unlock(&fi->rdc.lock);
+ unlock_page(page);
+ put_page(page);
+}
+
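The index/offset arithmetic above is the usual pagecache split, with the twist that an entry is never allowed to straddle a page boundary. As an illustration only (rdc_slot() is a hypothetical helper, not part of the patch):

	/* Where a dirent appended at cache byte offset "size" lands. */
	static void rdc_slot(loff_t size, size_t reclen,
			     pgoff_t *index, unsigned int *offset)
	{
		*offset = size & ~PAGE_MASK;	/* byte offset within the page */
		*index = size >> PAGE_SHIFT;	/* pagecache page index */
		if (*offset + reclen > PAGE_SIZE) {
			/* entry would straddle pages: start on the next page */
			(*index)++;
			*offset = 0;
		}
	}
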
+static void fuse_readdir_cache_end(struct file *file, loff_t pos)
+{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+ loff_t end;
+
+ spin_lock(&fi->rdc.lock);
+ /* does cache end position match current position? */
+ if (fi->rdc.pos != pos) {
+ spin_unlock(&fi->rdc.lock);
+ return;
+ }
+
+ fi->rdc.cached = true;
+ end = ALIGN(fi->rdc.size, PAGE_SIZE);
+ spin_unlock(&fi->rdc.lock);
+
+ /* truncate unused tail of cache */
+ truncate_inode_pages(file->f_mapping, end);
+}
+
+static bool fuse_emit(struct file *file, struct dir_context *ctx,
+ struct fuse_dirent *dirent)
+{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ fuse_add_dirent_to_cache(file, dirent, ctx->pos);
+
+ return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
+ dirent->type);
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx)
+{
+ while (nbytes >= FUSE_NAME_OFFSET) {
+ struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!fuse_emit(file, ctx, dirent))
+ break;
+
+ buf += reclen;
+ nbytes -= reclen;
+ ctx->pos = dirent->off;
+ }
+
+ return 0;
+}
+
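The parsers above and below walk packed records in the layout defined by include/uapi/linux/fuse.h; the relevant definitions are quoted here for context (FUSE_DIRENT_SIZE() rounds each record up to 8-byte alignment):

	struct fuse_dirent {
		uint64_t	ino;
		uint64_t	off;
		uint32_t	namelen;
		uint32_t	type;
		char name[];
	};

	#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
	#define FUSE_DIRENT_ALIGN(x) \
		(((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
	#define FUSE_DIRENT_SIZE(d) \
		FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)

	struct fuse_direntplus {
		struct fuse_entry_out entry_out;
		struct fuse_dirent dirent;
	};

	#define FUSE_NAME_OFFSET_DIRENTPLUS \
		offsetof(struct fuse_direntplus, dirent.name)
	#define FUSE_DIRENTPLUS_SIZE(d) \
		FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
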
+static int fuse_direntplus_link(struct file *file,
+ struct fuse_direntplus *direntplus,
+ u64 attr_version)
+{
+ struct fuse_entry_out *o = &direntplus->entry_out;
+ struct fuse_dirent *dirent = &direntplus->dirent;
+ struct dentry *parent = file->f_path.dentry;
+ struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = d_inode(parent);
+ struct fuse_conn *fc;
+ struct inode *inode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ if (!o->nodeid) {
+ /*
+ * Unlike in the case of fuse_lookup, zero nodeid does not mean
+ * ENOENT. Instead, it only means the userspace filesystem did
+ * not want to return attributes/handle for this entry.
+ *
+ * So do nothing.
+ */
+ return 0;
+ }
+
+ if (name.name[0] == '.') {
+ /*
+ * We could potentially refresh the attributes of the directory
+ * and its parent?
+ */
+ if (name.len == 1)
+ return 0;
+ if (name.name[1] == '.' && name.len == 2)
+ return 0;
+ }
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (!fuse_valid_type(o->attr.mode))
+ return -EIO;
+
+ fc = get_fuse_conn(dir);
+
+ name.hash = full_name_hash(parent, name.name, name.len);
+ dentry = d_lookup(parent, &name);
+ if (!dentry) {
+retry:
+ dentry = d_alloc_parallel(parent, &name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (!d_in_lookup(dentry)) {
+ struct fuse_inode *fi;
+ inode = d_inode(dentry);
+ if (!inode ||
+ get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ d_invalidate(dentry);
+ dput(dentry);
+ goto retry;
+ }
+ if (is_bad_inode(inode)) {
+ dput(dentry);
+ return -EIO;
+ }
+
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+
+ forget_all_cached_acls(inode);
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+ /*
+ * The other branch comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ } else {
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o),
+ attr_version);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
+
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (fc->readdirplus_auto)
+ set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+ fuse_change_entry_timeout(dentry, o);
+
+ dput(dentry);
+ return 0;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx, u64 attr_version)
+{
+ struct fuse_direntplus *direntplus;
+ struct fuse_dirent *dirent;
+ size_t reclen;
+ int over = 0;
+ int ret;
+
+ while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+ direntplus = (struct fuse_direntplus *) buf;
+ dirent = &direntplus->dirent;
+ reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!over) {
+ /*
+  * We fill entries into dstbuf only as much as it can hold.
+  * But we still continue iterating over the remaining entries
+  * to link them. For any entry we did not manage to link, a
+  * FORGET must be sent instead.
+  */
+ over = !fuse_emit(file, ctx, dirent);
+ if (!over)
+ ctx->pos = dirent->off;
+ }
+
+ buf += reclen;
+ nbytes -= reclen;
+
+ ret = fuse_direntplus_link(file, direntplus, attr_version);
+ if (ret)
+ fuse_force_forget(file, direntplus->entry_out.nodeid);
+ }
+
+ return 0;
+}
+
+static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
+{
+ int plus, err;
+ size_t nbytes;
+ struct page *page;
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ u64 attr_version = 0;
+ bool locked;
+
+ req = fuse_get_req(fc, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ fuse_put_request(fc, req);
+ return -ENOMEM;
+ }
+
+ plus = fuse_use_readdirplus(inode, ctx);
+ req->out.argpages = 1;
+ req->num_pages = 1;
+ req->pages[0] = page;
+ req->page_descs[0].length = PAGE_SIZE;
+ if (plus) {
+ attr_version = fuse_get_attr_version(fc);
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIRPLUS);
+ } else {
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIR);
+ }
+ locked = fuse_lock_inode(inode);
+ fuse_request_send(fc, req);
+ fuse_unlock_inode(inode, locked);
+ nbytes = req->out.args[0].size;
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ if (!err) {
+ if (!nbytes) {
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ fuse_readdir_cache_end(file, ctx->pos);
+ } else if (plus) {
+ err = parse_dirplusfile(page_address(page), nbytes,
+ file, ctx, attr_version);
+ } else {
+ err = parse_dirfile(page_address(page), nbytes, file,
+ ctx);
+ }
+ }
+
+ __free_page(page);
+ fuse_invalidate_atime(inode);
+ return err;
+}
+
+enum fuse_parse_result {
+ FOUND_ERR = -1,
+ FOUND_NONE = 0,
+ FOUND_SOME,
+ FOUND_ALL,
+};
+
+static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
+ void *addr, unsigned int size,
+ struct dir_context *ctx)
+{
+ unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
+ enum fuse_parse_result res = FOUND_NONE;
+
+ WARN_ON(offset >= size);
+
+ for (;;) {
+ struct fuse_dirent *dirent = addr + offset;
+ unsigned int nbytes = size - offset;
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+
+ if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen)
+ break;
+
+ if (WARN_ON(dirent->namelen > FUSE_NAME_MAX))
+ return FOUND_ERR;
+ if (WARN_ON(reclen > nbytes))
+ return FOUND_ERR;
+ if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL))
+ return FOUND_ERR;
+
+ if (ff->readdir.pos == ctx->pos) {
+ res = FOUND_SOME;
+ if (!dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type))
+ return FOUND_ALL;
+ ctx->pos = dirent->off;
+ }
+ ff->readdir.pos = dirent->off;
+ ff->readdir.cache_off += reclen;
+
+ offset += reclen;
+ }
+
+ return res;
+}
+
+static void fuse_rdc_reset(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ fi->rdc.cached = false;
+ fi->rdc.version++;
+ fi->rdc.size = 0;
+ fi->rdc.pos = 0;
+}
+
+#define UNCACHED 1
+
+static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ enum fuse_parse_result res;
+ pgoff_t index;
+ unsigned int size;
+ struct page *page;
+ void *addr;
+
+ /* Has the position changed (seek)? If so, reset the cache stream */
+ if (ff->readdir.pos != ctx->pos) {
+ ff->readdir.pos = 0;
+ ff->readdir.cache_off = 0;
+ }
+
+ /*
+ * We're just about to start reading into the cache or reading the
+ * cache; both cases require an up-to-date mtime value.
+ */
+ if (!ctx->pos && fc->auto_inval_data) {
+ int err = fuse_update_attributes(inode, file);
+
+ if (err)
+ return err;
+ }
+
+retry:
+ spin_lock(&fi->rdc.lock);
+retry_locked:
+ if (!fi->rdc.cached) {
+ /* Starting cache? Set cache mtime. */
+ if (!ctx->pos && !fi->rdc.size) {
+ fi->rdc.mtime = inode->i_mtime;
+ fi->rdc.iversion = inode_query_iversion(inode);
+ }
+ spin_unlock(&fi->rdc.lock);
+ return UNCACHED;
+ }
+ /*
+ * At the beginning of the directory (i.e. just after opendir(3) or
+ * rewinddir(3)) we need to check whether the directory contents have
+ * changed, and reset the cache if so.
+ */
+ if (!ctx->pos) {
+ if (inode_peek_iversion(inode) != fi->rdc.iversion ||
+ !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+ fuse_rdc_reset(inode);
+ goto retry_locked;
+ }
+ }
+
+ /*
+ * If cache version changed since the last getdents() call, then reset
+ * the cache stream.
+ */
+ if (ff->readdir.version != fi->rdc.version) {
+ ff->readdir.pos = 0;
+ ff->readdir.cache_off = 0;
+ }
+ /*
+ * If at the beginning of the cache, then reset the version to the
+ * current one.
+ */
+ if (ff->readdir.pos == 0)
+ ff->readdir.version = fi->rdc.version;
+
+ WARN_ON(fi->rdc.size < ff->readdir.cache_off);
+
+ index = ff->readdir.cache_off >> PAGE_SHIFT;
+
+ if (index == (fi->rdc.size >> PAGE_SHIFT))
+ size = fi->rdc.size & ~PAGE_MASK;
+ else
+ size = PAGE_SIZE;
+ spin_unlock(&fi->rdc.lock);
+
+ /* EOF? */
+ if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
+ return 0;
+
+ page = find_get_page_flags(file->f_mapping, index,
+ FGP_ACCESSED | FGP_LOCK);
+ spin_lock(&fi->rdc.lock);
+ if (!page) {
+ /*
+ * Uh-oh: page gone missing, cache is useless
+ */
+ if (fi->rdc.version == ff->readdir.version)
+ fuse_rdc_reset(inode);
+ goto retry_locked;
+ }
+
+ /* Make sure it's still the same version after getting the page. */
+ if (ff->readdir.version != fi->rdc.version) {
+ spin_unlock(&fi->rdc.lock);
+ unlock_page(page);
+ put_page(page);
+ goto retry;
+ }
+ spin_unlock(&fi->rdc.lock);
+
+ /*
+ * Contents of the page are now protected against changing by holding
+ * the page lock.
+ */
+ addr = kmap(page);
+ res = fuse_parse_cache(ff, addr, size, ctx);
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (res == FOUND_ERR)
+ return -EIO;
+
+ if (res == FOUND_ALL)
+ return 0;
+
+ if (size == PAGE_SIZE) {
+ /* We hit end of page: skip to next page. */
+ ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
+ goto retry;
+ }
+
+ /*
+ * End of cache reached. If we found the position, we are done;
+ * otherwise we need to fall back to the uncached path, since the
+ * position we were looking for wasn't in the cache.
+ */
+ return res == FOUND_SOME ? 0 : UNCACHED;
+}
+
+int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ int err;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ mutex_lock(&ff->readdir.lock);
+
+ err = UNCACHED;
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ err = fuse_readdir_cached(file, ctx);
+ if (err == UNCACHED)
+ err = fuse_readdir_uncached(file, ctx);
+
+ mutex_unlock(&ff->readdir.lock);
+
+ return err;
+}
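From userspace, a filesystem opts into this cache per directory open by setting FOPEN_CACHE_DIR in its OPENDIR reply. A libfuse-style sketch (assumes libfuse >= 3.5, where cache_readdir maps to FOPEN_CACHE_DIR; path_of() is a hypothetical helper):

	/* Assumed headers: <fuse_lowlevel.h>, <dirent.h>, <errno.h> */
	static void xmp_opendir(fuse_req_t req, fuse_ino_t ino,
				struct fuse_file_info *fi)
	{
		DIR *dp = opendir(path_of(ino));

		if (!dp) {
			fuse_reply_err(req, errno);
			return;
		}
		fi->fh = (uintptr_t)dp;
		fi->cache_readdir = 1;	/* libfuse sets FOPEN_CACHE_DIR */
		fuse_reply_open(req, fi);
	}
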
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 31e8270d0b26..05dd78f4b2b3 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -820,10 +820,10 @@ out:
* @page: the page that's being released
* @gfp_mask: passed from Linux VFS, ignored by us
*
- * Call try_to_free_buffers() if the buffers in this page can be
- * released.
+ * Calls try_to_free_buffers() to free the buffers and put the page if the
+ * buffers can be released.
*
- * Returns: 0
+ * Returns: 1 if the page was put or else 0
*/
int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
@@ -930,14 +930,14 @@ static const struct address_space_operations gfs2_jdata_aops = {
void gfs2_set_aops(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
+ inode->i_mapping->a_ops = &gfs2_jdata_aops;
+ else if (gfs2_is_writeback(sdp))
inode->i_mapping->a_ops = &gfs2_writeback_aops;
- else if (gfs2_is_ordered(ip))
+ else if (gfs2_is_ordered(sdp))
inode->i_mapping->a_ops = &gfs2_ordered_aops;
- else if (gfs2_is_jdata(ip))
- inode->i_mapping->a_ops = &gfs2_jdata_aops;
else
BUG();
}
-
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index a683d9b27d76..02b2646d84b3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -14,6 +14,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/iomap.h>
+#include <linux/ktime.h>
#include "gfs2.h"
#include "incore.h"
@@ -826,7 +827,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
ret = gfs2_meta_inode_buffer(ip, &dibh);
if (ret)
goto unlock;
- iomap->private = dibh;
+ mp->mp_bh[0] = dibh;
if (gfs2_is_stuffed(ip)) {
if (flags & IOMAP_WRITE) {
@@ -863,9 +864,6 @@ unstuff:
len = lblock_stop - lblock + 1;
iomap->length = len << inode->i_blkbits;
- get_bh(dibh);
- mp->mp_bh[0] = dibh;
-
height = ip->i_height;
while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
height++;
@@ -898,8 +896,6 @@ out:
iomap->bdev = inode->i_sb->s_bdev;
unlock:
up_read(&ip->i_rw_mutex);
- if (ret && dibh)
- brelse(dibh);
return ret;
do_alloc:
@@ -980,9 +976,9 @@ static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
loff_t length, unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct metapath *mp)
{
- struct metapath mp = { .mp_aheight = 1, };
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
@@ -996,9 +992,9 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
unstuff = gfs2_is_stuffed(ip) &&
pos + length > gfs2_max_stuffed_size(ip);
- ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp);
if (ret)
- goto out_release;
+ goto out_unlock;
alloc_required = unstuff || iomap->type == IOMAP_HOLE;
@@ -1013,7 +1009,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
ret = gfs2_quota_lock_check(ip, &ap);
if (ret)
- goto out_release;
+ goto out_unlock;
ret = gfs2_inplace_reserve(ip, &ap);
if (ret)
@@ -1038,17 +1034,15 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
ret = gfs2_unstuff_dinode(ip, NULL);
if (ret)
goto out_trans_end;
- release_metapath(&mp);
- brelse(iomap->private);
- iomap->private = NULL;
+ release_metapath(mp);
ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
- flags, iomap, &mp);
+ flags, iomap, mp);
if (ret)
goto out_trans_end;
}
if (iomap->type == IOMAP_HOLE) {
- ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+ ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
if (ret) {
gfs2_trans_end(sdp);
gfs2_inplace_release(ip);
@@ -1056,7 +1050,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
goto out_qunlock;
}
}
- release_metapath(&mp);
if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
iomap->page_done = gfs2_iomap_journaled_page_done;
return 0;
@@ -1069,10 +1062,7 @@ out_trans_fail:
out_qunlock:
if (alloc_required)
gfs2_quota_unlock(ip);
-out_release:
- if (iomap->private)
- brelse(iomap->private);
- release_metapath(&mp);
+out_unlock:
gfs2_write_unlock(inode);
return ret;
}
@@ -1088,10 +1078,10 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
trace_gfs2_iomap_start(ip, pos, length, flags);
if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
- ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap);
+ ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
} else {
ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
- release_metapath(&mp);
+
/*
* Silently fall back to buffered I/O for stuffed files or if
* we've hit a hole (see gfs2_file_direct_write).
@@ -1100,6 +1090,11 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
iomap->type != IOMAP_MAPPED)
ret = -ENOTBLK;
}
+ if (!ret) {
+ get_bh(mp.mp_bh[0]);
+ iomap->private = mp.mp_bh[0];
+ }
+ release_metapath(&mp);
trace_gfs2_iomap_end(ip, iomap, ret);
return ret;
}
@@ -1908,10 +1903,16 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
if (ret < 0)
goto out;
- /* issue read-ahead on metadata */
- if (mp.mp_aheight > 1) {
- for (; ret > 1; ret--) {
- metapointer_range(&mp, mp.mp_aheight - ret,
+ /* On the first pass, issue read-ahead on metadata. */
+ if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
+ unsigned int height = mp.mp_aheight - 1;
+
+ /* No read-ahead for data blocks. */
+ if (mp.mp_aheight - 1 == strip_h)
+ height--;
+
+ for (; height >= mp.mp_aheight - ret; height--) {
+ metapointer_range(&mp, height,
start_list, start_aligned,
end_list, end_aligned,
&start, &end);
@@ -2083,6 +2084,8 @@ static int do_grow(struct inode *inode, u64 size)
}
error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+ (unstuff &&
+ gfs2_is_jdata(ip) ? RES_JDATA : 0) +
(sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
0 : RES_QUOTA), 0);
if (error)
@@ -2248,7 +2251,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
unsigned int shift = sdp->sd_sb.sb_bsize_shift;
u64 size;
int rc;
+ ktime_t start, end;
+ start = ktime_get();
lblock_stop = i_size_read(jd->jd_inode) >> shift;
size = (lblock_stop - lblock) << shift;
jd->nr_extents = 0;
@@ -2268,8 +2273,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
lblock += (bh.b_size >> ip->i_inode.i_blkbits);
} while(size > 0);
- fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
- jd->nr_extents);
+ end = ktime_get();
+ fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
+ jd->nr_extents, ktime_ms_delta(end, start));
return 0;
fail:
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 45a17b770d97..58a768e59712 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1199,13 +1199,13 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
mutex_lock(&fp->f_fl_mutex);
if (gfs2_holder_initialized(fl_gh)) {
+ struct file_lock request;
if (fl_gh->gh_state == state)
goto out;
- locks_lock_file_wait(file,
- &(struct file_lock) {
- .fl_type = F_UNLCK,
- .fl_flags = FL_FLOCK
- });
+ locks_init_lock(&request);
+ request.fl_type = F_UNLCK;
+ request.fl_flags = FL_FLOCK;
+ locks_lock_file_wait(file, &request);
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 05431324b262..b92740edc416 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1777,7 +1777,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*
*/
-void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned long long dtime;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 5e12220cc0c2..8949bf28b249 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -202,7 +202,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
struct gfs2_holder *gh);
extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
extern __printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c63bee9adb6a..78510ab91835 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -466,17 +466,25 @@ static int inode_go_lock(struct gfs2_holder *gh)
*
*/
-static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl)
{
- const struct gfs2_inode *ip = gl->gl_object;
+ struct gfs2_inode *ip = gl->gl_object;
+ struct inode *inode = &ip->i_inode;
+ unsigned long nrpages;
+
if (ip == NULL)
return;
- gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
+
+ xa_lock_irq(&inode->i_data.i_pages);
+ nrpages = inode->i_data.nrpages;
+ xa_unlock_irq(&inode->i_data.i_pages);
+
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu p:%lu\n",
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
IF2DT(ip->i_inode.i_mode), ip->i_flags,
(unsigned int)ip->i_diskflags,
- (unsigned long long)i_size_read(&ip->i_inode));
+ (unsigned long long)i_size_read(inode), nrpages);
}
/**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 888b62cfd6d1..e10e0b0a7cd5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -165,7 +165,6 @@ struct gfs2_bufdata {
u64 bd_blkno;
struct list_head bd_list;
- const struct gfs2_log_operations *bd_ops;
struct gfs2_trans *bd_tr;
struct list_head bd_ail_st_list;
@@ -244,7 +243,7 @@ struct gfs2_glock_operations {
int (*go_demote_ok) (const struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
- void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+ void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
const int go_type;
const unsigned long go_flags;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 648f0ca1ad57..998051c4aea7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -744,17 +744,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
the gfs2 structures. */
if (default_acl) {
error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto fail_gunlock3;
posix_acl_release(default_acl);
+ default_acl = NULL;
}
if (acl) {
- if (!error)
- error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
+ goto fail_gunlock3;
posix_acl_release(acl);
+ acl = NULL;
}
- if (error)
- goto fail_gunlock3;
-
error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
&gfs2_initxattrs, NULL);
if (error)
@@ -789,10 +791,8 @@ fail_free_inode:
}
gfs2_rsqa_delete(ip, NULL);
fail_free_acls:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(ghs);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index b5b6341a4f5c..793808263c6d 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -30,16 +30,14 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
return ip->i_diskflags & GFS2_DIF_JDATA;
}
-static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
+static inline bool gfs2_is_ordered(const struct gfs2_sbd *sdp)
{
- const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
+ return sdp->sd_args.ar_data == GFS2_DATA_ORDERED;
}
-static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
+static inline bool gfs2_is_writeback(const struct gfs2_sbd *sdp)
{
- const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
+ return sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK;
}
static inline int gfs2_is_dir(const struct gfs2_inode *ip)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 99dd58694ba1..b8830fda51e8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -605,7 +605,6 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
bd->bd_blkno = bh->b_blocknr;
gfs2_remove_from_ail(bd); /* drops ref on bh */
bd->bd_bh = NULL;
- bd->bd_ops = &gfs2_revoke_lops;
sdp->sd_log_num_revoke++;
atomic_inc(&gl->gl_revokes);
set_bit(GLF_LFLUSH, &gl->gl_flags);
@@ -734,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
lh->lh_crc = cpu_to_be32(crc);
gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr);
- gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags);
+ gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags);
log_flush_wait(sdp);
}
@@ -811,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
gfs2_ordered_write(sdp);
lops_before_commit(sdp, tr);
- gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
+ gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0);
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 20241436126d..1bc9bd444b28 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -51,12 +51,11 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if (!gfs2_is_ordered(ip))
+ if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp))
return;
- sdp = GFS2_SB(&ip->i_inode);
if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
spin_lock(&sdp->sd_ordered_lock);
if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4c7069b8f3c1..8722c60b11fe 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -168,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
* that is pinned in the pagecache.
*/
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+ struct bio_vec *bvec,
blk_status_t error)
{
struct buffer_head *bh, *next;
@@ -207,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio)
struct bio_vec *bvec;
struct page *page;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status) {
fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -214,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio)
wake_up(&sdp->sd_logd_waitq);
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
@@ -228,8 +230,8 @@ static void gfs2_end_log_write(struct bio *bio)
}
/**
- * gfs2_log_flush_bio - Submit any pending log bio
- * @sdp: The superblock
+ * gfs2_log_submit_bio - Submit any pending log bio
+ * @biop: Address of the bio pointer
* @op: REQ_OP
* @op_flags: req_flag_bits
*
@@ -237,74 +239,78 @@ static void gfs2_end_log_write(struct bio *bio)
* there is no pending bio, then this is a no-op.
*/
-void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags)
+void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags)
{
- if (sdp->sd_log_bio) {
+ struct bio *bio = *biop;
+ if (bio) {
+ struct gfs2_sbd *sdp = bio->bi_private;
atomic_inc(&sdp->sd_log_in_flight);
- bio_set_op_attrs(sdp->sd_log_bio, op, op_flags);
- submit_bio(sdp->sd_log_bio);
- sdp->sd_log_bio = NULL;
+ bio_set_op_attrs(bio, op, op_flags);
+ submit_bio(bio);
+ *biop = NULL;
}
}
/**
- * gfs2_log_alloc_bio - Allocate a new bio for log writing
- * @sdp: The superblock
- * @blkno: The next device block number we want to write to
+ * gfs2_log_alloc_bio - Allocate a bio
+ * @sdp: The super block
+ * @blkno: The device block number we want to write to
+ * @end_io: The bi_end_io callback
*
- * This should never be called when there is a cached bio in the
- * super block. When it returns, there will be a cached bio in the
- * super block which will have as many bio_vecs as the device is
- * happy to handle.
+ * Allocate a new bio, initialize it with the given parameters and return it.
*
- * Returns: Newly allocated bio
+ * Returns: The newly allocated bio
*/
-static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
+static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
+ bio_end_io_t *end_io)
{
struct super_block *sb = sdp->sd_vfs;
- struct bio *bio;
+ struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- BUG_ON(sdp->sd_log_bio);
-
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
bio_set_dev(bio, sb->s_bdev);
- bio->bi_end_io = gfs2_end_log_write;
+ bio->bi_end_io = end_io;
bio->bi_private = sdp;
- sdp->sd_log_bio = bio;
-
return bio;
}
/**
* gfs2_log_get_bio - Get cached log bio, or allocate a new one
- * @sdp: The superblock
+ * @sdp: The super block
* @blkno: The device block number we want to write to
+ * @bio: The bio to get or allocate
+ * @op: REQ_OP
+ * @end_io: The bi_end_io callback
+ * @flush: Always flush the current bio and allocate a new one?
*
* If there is a cached bio, then if the next block number is sequential
* with the previous one, return it, otherwise flush the bio to the
- * device. If there is not a cached bio, or we just flushed it, then
+ * device. If there is no cached bio, or we just flushed it, then
* allocate a new one.
*
* Returns: The bio to use for log writes
*/
-static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
+static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
+ struct bio **biop, int op,
+ bio_end_io_t *end_io, bool flush)
{
- struct bio *bio = sdp->sd_log_bio;
- u64 nblk;
+ struct bio *bio = *biop;
if (bio) {
+ u64 nblk;
+
nblk = bio_end_sector(bio);
nblk >>= sdp->sd_fsb2bb_shift;
- if (blkno == nblk)
+ if (blkno == nblk && !flush)
return bio;
- gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
+ gfs2_log_submit_bio(biop, op, 0);
}
- return gfs2_log_alloc_bio(sdp, blkno);
+ *biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
+ return *biop;
}
/**
@@ -326,11 +332,12 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
struct bio *bio;
int ret;
- bio = gfs2_log_get_bio(sdp, blkno);
+ bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE,
+ gfs2_end_log_write, false);
ret = bio_add_page(bio, page, size, offset);
if (ret == 0) {
- gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0);
- bio = gfs2_log_alloc_bio(sdp, blkno);
+ bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio,
+ REQ_OP_WRITE, gfs2_end_log_write, true);
ret = bio_add_page(bio, page, size, offset);
WARN_ON(ret == 0);
}
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index e4949394f054..711c4d89c063 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -30,7 +30,7 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp);
extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
unsigned size, unsigned offset, u64 blkno);
extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
-extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags);
+extern void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be9c0bf697fe..3201342404a7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct buffer_head *bh = page_buffers(page);
unsigned int len = bvec->bv_len;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 0f501f938d1c..2dac43065382 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd)
}
}
+int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
+ unsigned int blkno, struct gfs2_log_header_host *head)
+{
+ u32 hash, crc;
+
+ if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
+ lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
+ (blkno && be32_to_cpu(lh->lh_blkno) != blkno))
+ return 1;
+
+ hash = crc32(~0, lh, LH_V1_SIZE - 4);
+ hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */
+
+ if (be32_to_cpu(lh->lh_hash) != hash)
+ return 1;
+
+ crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
+ sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4);
+
+ if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc))
+ return 1;
+
+ head->lh_sequence = be64_to_cpu(lh->lh_sequence);
+ head->lh_flags = be32_to_cpu(lh->lh_flags);
+ head->lh_tail = be32_to_cpu(lh->lh_tail);
+ head->lh_blkno = be32_to_cpu(lh->lh_blkno);
+
+ return 0;
+}
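Factoring the header checks out of get_log_header() lets other journal scanners reuse them. A minimal sketch of such a caller (find_head_sketch() is illustrative only; the real head lookup adds sequence-ordering checks and read-ahead):

	static int find_head_sketch(struct gfs2_jdesc *jd,
				    struct gfs2_log_header_host *head)
	{
		struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
		struct gfs2_log_header_host lh;
		struct buffer_head *bh;
		unsigned int blk;

		head->lh_sequence = 0;
		for (blk = 0; blk < jd->jd_blocks; blk++) {
			if (gfs2_replay_read_block(jd, blk, &bh))
				continue;
			/* keep the valid header with the highest sequence */
			if (!__get_log_header(sdp, (const void *)bh->b_data,
					      blk, &lh) &&
			    lh.lh_sequence > head->lh_sequence)
				*head = lh;
			brelse(bh);
		}
		return head->lh_sequence ? 0 : -EIO;
	}
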
/**
* get_log_header - read the log header for a given segment
* @jd: the journal
@@ -137,36 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd)
static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
struct gfs2_log_header_host *head)
{
- struct gfs2_log_header *lh;
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct buffer_head *bh;
- u32 hash, crc;
int error;
error = gfs2_replay_read_block(jd, blk, &bh);
if (error)
return error;
- lh = (void *)bh->b_data;
-
- hash = crc32(~0, lh, LH_V1_SIZE - 4);
- hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */
-
- crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
- bh->b_size - LH_V1_SIZE - 4);
-
- error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
- lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
- be32_to_cpu(lh->lh_blkno) != blk ||
- be32_to_cpu(lh->lh_hash) != hash ||
- (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc);
+ error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data,
+ blk, head);
brelse(bh);
- if (!error) {
- head->lh_sequence = be64_to_cpu(lh->lh_sequence);
- head->lh_flags = be32_to_cpu(lh->lh_flags);
- head->lh_tail = be32_to_cpu(lh->lh_tail);
- head->lh_blkno = be32_to_cpu(lh->lh_blkno);
- }
return error;
}
@@ -460,6 +471,8 @@ void gfs2_recover_func(struct work_struct *work)
if (error)
goto fail_gunlock_ji;
t_jhd = ktime_get();
+ fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid,
+ ktime_ms_delta(t_jhd, t_jlck));
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 11fdfab4bf99..11d81248be85 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -31,6 +31,9 @@ extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head);
extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
extern void gfs2_recover_func(struct work_struct *work);
+extern int __get_log_header(struct gfs2_sbd *sdp,
+ const struct gfs2_log_header *lh, unsigned int blkno,
+ struct gfs2_log_header_host *head);
#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ffe3032b1043..17a8d3b43990 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -733,6 +733,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
if (gl) {
glock_clear_object(gl, rgd);
+ gfs2_rgrp_brelse(rgd);
gfs2_glock_put(gl);
}
@@ -1174,7 +1175,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
* @rgd: the struct gfs2_rgrpd describing the RG to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
- * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
+ * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
*
* Returns: errno
*/
@@ -2255,7 +2256,7 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
*
*/
-void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl)
{
struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_blkreserv *trs;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b596c3d17988..499079a9dbbe 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -72,7 +72,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl);
extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
struct buffer_head *bh,
const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 423bc2d03dd8..cd9a94a6b5bb 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -124,15 +124,13 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
}
static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
- struct buffer_head *bh,
- const struct gfs2_log_operations *lops)
+ struct buffer_head *bh)
{
struct gfs2_bufdata *bd;
bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
bd->bd_bh = bh;
bd->bd_gl = gl;
- bd->bd_ops = lops;
INIT_LIST_HEAD(&bd->bd_list);
bh->b_private = bd;
return bd;
@@ -169,7 +167,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
gfs2_log_unlock(sdp);
unlock_buffer(bh);
if (bh->b_private == NULL)
- bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ bd = gfs2_alloc_bufdata(gl, bh);
else
bd = bh->b_private;
lock_buffer(bh);
@@ -210,7 +208,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
unlock_buffer(bh);
lock_page(bh->b_page);
if (bh->b_private == NULL)
- bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ bd = gfs2_alloc_bufdata(gl, bh);
else
bd = bh->b_private;
unlock_page(bh->b_page);
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9a8772465a90..896396554bcc 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -425,6 +425,10 @@ skip:
if (new_node) {
__be32 cnid;
+ if (!new_node->parent) {
+ hfs_btree_inc_height(tree);
+ new_node->parent = tree->root;
+ }
fd->bnode = hfs_bnode_find(tree, new_node->parent);
/* create index key and entry */
hfs_bnode_read_key(new_node, fd->search_key, 14);
@@ -441,6 +445,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 374b5688e29e..19017d296173 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -220,25 +220,17 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
-
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- u32 count;
- int res;
+ struct inode *inode = tree->inode;
+ u32 count;
+ int res;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfs_extend_file(inode);
if (res)
- return ERR_PTR(res);
+ return res;
HFS_I(inode)->phys_size = inode->i_size =
(loff_t)HFS_I(inode)->alloc_blocks *
HFS_SB(tree->sb)->alloc_blksz;
@@ -246,9 +238,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
tree->sb->s_blocksize_bits;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
- tree->free_nodes = count - tree->node_count;
+ tree->free_nodes += count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
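Callers are now expected to reserve nodes before starting a btree update, so that an allocation failure surfaces before any node is modified; the catalog and extent changes below follow this pattern. A sketch of the typical call sequence (error label assumed):

	/*
	 * A record insert can split one node per level and may also
	 * grow the tree, hence the depth-based estimate.
	 */
	err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
	if (err)
		goto out;	/* fail before touching any bnode */
	err = hfs_brec_insert(&fd, &entry, entry_size);
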
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
@@ -329,13 +338,14 @@ void hfs_bmap_free(struct hfs_bnode *node)
nidx -= len * 8;
i = node->next;
- hfs_bnode_put(node);
if (!i) {
/* panic */;
pr_crit("unable to free bnode %u. bmap not found!\n",
node->this);
+ hfs_bnode_put(node);
return;
}
+ hfs_bnode_put(node);
node = hfs_bnode_find(tree, i);
if (IS_ERR(node))
return;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index c8b252dbb26c..dcc2aab1b2c4 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -82,6 +82,7 @@ struct hfs_find_data {
extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp);
extern void hfs_btree_close(struct hfs_btree *);
extern void hfs_btree_write(struct hfs_btree *);
+extern int hfs_bmap_reserve(struct hfs_btree *, int);
extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *);
extern void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 8a66405b0f8b..d365bf0b8c77 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -97,6 +97,14 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
HFS_CDR_THD : HFS_CDR_FTH,
@@ -295,6 +303,14 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
err = hfs_brec_find(&src_fd);
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 5d0182654580..263d5028d9d1 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -117,6 +117,10 @@ static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
} else {
@@ -300,7 +304,7 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type)
return 0;
blocks = 0;
- for (i = 0; i < 3; extent++, i++)
+ for (i = 0; i < 3; i++)
blocks += be16_to_cpu(extent[i].count);
res = hfs_free_extents(sb, extent, blocks, blocks);
@@ -341,7 +345,9 @@ int hfs_get_block(struct inode *inode, sector_t block,
ablock = (u32)block / HFS_SB(sb)->fs_div;
if (block >= HFS_I(inode)->fs_blocks) {
- if (block > HFS_I(inode)->fs_blocks || !create)
+ if (!create)
+ return 0;
+ if (block > HFS_I(inode)->fs_blocks)
return -EIO;
if (ablock >= HFS_I(inode)->alloc_blocks) {
res = hfs_extend_file(inode);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a2dfa1b2a89c..da243c84e93b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -642,6 +642,8 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
+ inode->i_atime = inode->i_mtime = inode->i_ctime =
+ current_time(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 2bab6b3cdba4..e6d554476db4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -217,6 +217,11 @@ int hfsplus_create_attr(struct inode *inode,
if (err)
goto failed_init_create_attr;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1);
+ if (err)
+ goto failed_create_attr;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
@@ -313,6 +318,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
if (err)
return err;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth);
+ if (err)
+ goto out;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index ed8eacb34452..1918544a7871 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -429,6 +429,10 @@ skip:
if (new_node) {
__be32 cnid;
+ if (!new_node->parent) {
+ hfs_btree_inc_height(tree);
+ new_node->parent = tree->root;
+ }
fd->bnode = hfs_bnode_find(tree, new_node->parent);
/* create index key and entry */
hfs_bnode_read_key(new_node, fd->search_key, 14);
@@ -445,6 +449,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index de14b2b6881b..66774f4cb4fd 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -342,26 +342,21 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
+ struct inode *inode = tree->inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ u32 count;
+ int res;
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- u32 count;
- int res;
+ if (rsvd_nodes <= 0)
+ return 0;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
if (res)
- return ERR_PTR(res);
+ return res;
hip->phys_size = inode->i_size =
(loff_t)hip->alloc_blocks <<
HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
@@ -369,9 +364,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
- tree->free_nodes = count - tree->node_count;
+ tree->free_nodes += count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
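
As a rough illustration of how the callers size these reservations (a sketch, not code from this patch): inserting one record can split at most one node on each level of the tree and may additionally grow a new root, so depth + 1 nodes bounds a single insert; the catalog operations touch two records (the entry plus its thread record), which is where the 2 * depth style constants in the callers come from.

/* illustrative bound only; the callers use slightly tighter
 * per-operation constants */
static int worst_case_bmap_nodes(int depth, int nr_records)
{
	return nr_records * depth + 1;	/* one split per level, one new root */
}
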
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
@@ -454,14 +466,15 @@ void hfs_bmap_free(struct hfs_bnode *node)
nidx -= len * 8;
i = node->next;
- hfs_bnode_put(node);
if (!i) {
/* panic */;
pr_crit("unable to free bnode %u. "
"bmap not found!\n",
node->this);
+ hfs_bnode_put(node);
return;
}
+ hfs_bnode_put(node);
node = hfs_bnode_find(tree, i);
if (IS_ERR(node))
return;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index a196369ba779..35472cba750e 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -265,6 +265,14 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
@@ -333,6 +341,14 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2);
+ if (err)
+ goto out;
+
if (!str) {
int len;
@@ -433,6 +449,14 @@ int hfsplus_rename_cat(u32 cnid,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most twice.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
src_dir->i_ino, src_name);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f37662675c3a..29a9dcfbe81f 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -565,6 +565,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
.symlink = hfsplus_symlink,
.mknod = hfsplus_mknod,
.rename = hfsplus_rename,
+ .getattr = hfsplus_getattr,
.listxattr = hfsplus_listxattr,
};
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 8e0f59767694..a930ddd15681 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -100,6 +100,10 @@ static int __hfsplus_ext_write_extent(struct inode *inode,
if (hip->extent_state & HFSPLUS_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, hip->cached_extents,
sizeof(hfsplus_extent_rec));
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
@@ -233,7 +237,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
ablock = iblock >> sbi->fs_shift;
if (iblock >= hip->fs_blocks) {
- if (iblock > hip->fs_blocks || !create)
+ if (!create)
+ return 0;
+ if (iblock > hip->fs_blocks)
return -EIO;
if (ablock >= hip->alloc_blocks) {
res = hfsplus_file_extend(inode, false);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 8e039435958a..b8471bf05def 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -311,6 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
#define hfs_btree_open hfsplus_btree_open
#define hfs_btree_close hfsplus_btree_close
#define hfs_btree_write hfsplus_btree_write
+#define hfs_bmap_reserve hfsplus_bmap_reserve
#define hfs_bmap_alloc hfsplus_bmap_alloc
#define hfs_bmap_free hfsplus_bmap_free
#define hfs_bnode_read hfsplus_bnode_read
@@ -395,6 +396,7 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors,
struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
void hfs_btree_close(struct hfs_btree *tree);
int hfs_btree_write(struct hfs_btree *tree);
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes);
struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
void hfs_bmap_free(struct hfs_bnode *node);
@@ -486,6 +488,8 @@ void hfsplus_inode_write_fork(struct inode *inode,
struct hfsplus_fork_raw *fork);
int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
int hfsplus_cat_write_inode(struct inode *inode);
+int hfsplus_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8e9427a42b81..d131c8ea7eb6 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -261,6 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
}
setattr_copy(inode, attr);
@@ -269,6 +270,26 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
+int hfsplus_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+
+ if (inode->i_flags & S_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (inode->i_flags & S_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (hip->userflags & HFSPLUS_FLG_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
+ stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP;
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
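
For context, the new ->getattr hook is visible to userspace through statx(2), which surfaces the attribute bits hfsplus_getattr() fills in. A minimal sketch, assuming a glibc that provides the statx() wrapper (2.28+); error handling is elided.

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>	/* AT_FDCWD */
#include <sys/stat.h>	/* statx(), STATX_* */

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx))
		return 1;
	/* only trust bits the filesystem declared in attributes_mask */
	if (stx.stx_attributes_mask & STATX_ATTR_NODUMP)
		printf("nodump: %s\n",
		       (stx.stx_attributes & STATX_ATTR_NODUMP) ? "yes" : "no");
	return 0;
}
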
+
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
@@ -328,6 +349,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
static const struct inode_operations hfsplus_file_inode_operations = {
.setattr = hfsplus_setattr,
+ .getattr = hfsplus_getattr,
.listxattr = hfsplus_listxattr,
};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a10100e..b0eef008de67 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
}
@@ -859,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
+
+ /*
+ * page_private is the subpool pointer in hugetlb pages. Transfer to
+ * the new page. PagePrivate is not associated with page_private for
+ * hugetlb pages and cannot be set here, as only page_huge_active
+ * pages can be migrated.
+ */
+ if (page_private(page)) {
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ }
+
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
diff --git a/fs/inode.c b/fs/inode.c
index 42f6d25f32a5..e9d97add2b36 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -10,7 +10,7 @@
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
@@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
- INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
+ xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
@@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait);
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
- unsigned int old_flags, new_flags;
-
WARN_ON_ONCE(flags & ~mask);
- do {
- old_flags = READ_ONCE(inode->i_flags);
- new_flags = (old_flags & ~mask) | flags;
- } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
- new_flags) != old_flags));
+ set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
@@ -2146,7 +2140,9 @@ EXPORT_SYMBOL(timespec64_trunc);
*/
struct timespec64 current_time(struct inode *inode)
{
- struct timespec64 now = current_kernel_time64();
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
if (unlikely(!inode->i_sb)) {
WARN(1, "current_time() called with uninitialized super_block in the inode");
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 000000000000..5d99376d2369
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,2971 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared application/kernel submission and completion ring pairs, for
+ * supporting fast/efficient IO.
+ *
+ * A note on the read/write ordering memory barriers that are matched between
+ * the application and kernel side. When the application reads the CQ ring
+ * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
+ * the kernel uses after writing the tail. Failure to do so could cause a
+ * delay in when the application notices that completion events are available.
+ * This isn't a fatal condition. Likewise, the application must use an
+ * appropriate smp_wmb() both before writing the SQ tail, and after writing
+ * the SQ tail. The first one orders the sqe writes with the tail write, and
+ * the latter is paired with the smp_rmb() the kernel will issue before
+ * reading the SQ tail on submission.
+ *
+ * Also see the examples in the liburing library:
+ *
+ * git://git.kernel.dk/liburing
+ *
+ * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
+ * on data shared between the kernel and application. This is done both
+ * for ordering purposes and to ensure that once a value is loaded from
+ * data that the application could potentially modify, it remains stable.
+ *
+ * Copyright (C) 2018-2019 Jens Axboe
+ * Copyright (c) 2018-2019 Christoph Hellwig
+ */
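
A minimal userspace sketch of the CQ-side contract described above, with C11 atomics standing in for the kernel's smp_rmb()/smp_wmb(); the app_cq layout and pointer setup here are illustrative stand-ins for what liburing derives from the mmap()'d ring, not part of the ABI.

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>	/* struct io_uring_cqe */

struct app_cq {
	_Atomic uint32_t *khead;	/* written by the application */
	_Atomic uint32_t *ktail;	/* written by the kernel */
	uint32_t mask;			/* ring_entries - 1 */
	struct io_uring_cqe *cqes;
};

/* returns 1 and fills *out if a completion was available */
static int reap_cqe(struct app_cq *cq, struct io_uring_cqe *out)
{
	uint32_t head = atomic_load_explicit(cq->khead, memory_order_relaxed);
	/* acquire pairs with the kernel's barrier after its tail store */
	uint32_t tail = atomic_load_explicit(cq->ktail, memory_order_acquire);

	if (head == tail)
		return 0;
	memcpy(out, &cq->cqes[head & cq->mask], sizeof(*out));
	/* publish the consumed slot only after the copy above */
	atomic_store_explicit(cq->khead, head + 1, memory_order_release);
	return 1;
}
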
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/refcount.h>
+#include <linux/uio.h>
+
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/bvec.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+#include <linux/sizes.h>
+#include <linux/hugetlb.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "internal.h"
+
+#define IORING_MAX_ENTRIES 4096
+#define IORING_MAX_FIXED_FILES 1024
+
+struct io_uring {
+ u32 head ____cacheline_aligned_in_smp;
+ u32 tail ____cacheline_aligned_in_smp;
+};
+
+struct io_sq_ring {
+ struct io_uring r;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 dropped;
+ u32 flags;
+ u32 array[];
+};
+
+struct io_cq_ring {
+ struct io_uring r;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 overflow;
+ struct io_uring_cqe cqes[];
+};
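
Both rings rely on the same index arithmetic: ring_entries is a power of two, so the free-running 32-bit head/tail counters map to slots via ring_mask = ring_entries - 1, and tail - head counts queued entries even across counter wraparound. A standalone illustration:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t entries = 8, mask = entries - 1;
	uint32_t head = 0xfffffffe, tail = 2;	/* counters have wrapped */

	assert(tail - head == 4);	/* 4 entries in flight, despite wrap */
	assert((head & mask) == 6);	/* consumer slot */
	assert((tail & mask) == 2);	/* producer slot */
	return 0;
}
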
+
+struct io_mapped_ubuf {
+ u64 ubuf;
+ size_t len;
+ struct bio_vec *bvec;
+ unsigned int nr_bvecs;
+};
+
+struct async_list {
+ spinlock_t lock;
+ atomic_t cnt;
+ struct list_head list;
+
+ struct file *file;
+ off_t io_end;
+ size_t io_pages;
+};
+
+struct io_ring_ctx {
+ struct {
+ struct percpu_ref refs;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned int flags;
+ bool compat;
+ bool account_mem;
+
+ /* SQ ring */
+ struct io_sq_ring *sq_ring;
+ unsigned cached_sq_head;
+ unsigned sq_entries;
+ unsigned sq_mask;
+ unsigned sq_thread_idle;
+ struct io_uring_sqe *sq_sqes;
+ } ____cacheline_aligned_in_smp;
+
+ /* IO offload */
+ struct workqueue_struct *sqo_wq;
+ struct task_struct *sqo_thread; /* if using sq thread polling */
+ struct mm_struct *sqo_mm;
+ wait_queue_head_t sqo_wait;
+ unsigned sqo_stop;
+
+ struct {
+ /* CQ ring */
+ struct io_cq_ring *cq_ring;
+ unsigned cached_cq_tail;
+ unsigned cq_entries;
+ unsigned cq_mask;
+ struct wait_queue_head cq_wait;
+ struct fasync_struct *cq_fasync;
+ } ____cacheline_aligned_in_smp;
+
+ /*
+ * If used, fixed file set. Writers must ensure that ->refs is dead,
+ * readers must ensure that ->refs is alive as long as the file* is
+ * used. Only updated through io_uring_register(2).
+ */
+ struct file **user_files;
+ unsigned nr_user_files;
+
+ /* if used, fixed mapped user buffers */
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf *user_bufs;
+
+ struct user_struct *user;
+
+ struct completion ctx_done;
+
+ struct {
+ struct mutex uring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t completion_lock;
+ bool poll_multi_file;
+ /*
+ * ->poll_list is protected by the ctx->uring_lock for
+ * io_uring instances that don't use IORING_SETUP_SQPOLL.
+ * For SQPOLL, only the single threaded io_sq_thread() will
+ * manipulate the list, hence no extra locking is needed there.
+ */
+ struct list_head poll_list;
+ struct list_head cancel_list;
+ } ____cacheline_aligned_in_smp;
+
+ struct async_list pending_async[2];
+
+#if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+#endif
+};
+
+struct sqe_submit {
+ const struct io_uring_sqe *sqe;
+ unsigned short index;
+ bool has_user;
+ bool needs_lock;
+ bool needs_fixed_file;
+};
+
+struct io_poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+ bool woken;
+ bool canceled;
+ struct wait_queue_entry wait;
+};
+
+struct io_kiocb {
+ union {
+ struct kiocb rw;
+ struct io_poll_iocb poll;
+ };
+
+ struct sqe_submit submit;
+
+ struct io_ring_ctx *ctx;
+ struct list_head list;
+ unsigned int flags;
+ refcount_t refs;
+#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
+#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
+#define REQ_F_FIXED_FILE 4 /* ctx owns file */
+#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+ u64 user_data;
+ u64 error;
+
+ struct work_struct work;
+};
+
+#define IO_PLUG_THRESHOLD 2
+#define IO_IOPOLL_BATCH 8
+
+struct io_submit_state {
+ struct blk_plug plug;
+
+ /*
+ * io_kiocb alloc cache
+ */
+ void *reqs[IO_IOPOLL_BATCH];
+ unsigned int free_reqs;
+ unsigned int cur_req;
+
+ /*
+ * File reference cache
+ */
+ struct file *file;
+ unsigned int fd;
+ unsigned int has_refs;
+ unsigned int used_refs;
+ unsigned int ios_left;
+};
+
+static struct kmem_cache *req_cachep;
+
+static const struct file_operations io_uring_fops;
+
+struct sock *io_uring_get_socket(struct file *file)
+{
+#if defined(CONFIG_UNIX)
+ if (file->f_op == &io_uring_fops) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ return ctx->ring_sock->sk;
+ }
+#endif
+ return NULL;
+}
+EXPORT_SYMBOL(io_uring_get_socket);
+
+static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+{
+ struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
+
+ complete(&ctx->ctx_done);
+}
+
+static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx;
+ int i;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
+ kfree(ctx);
+ return NULL;
+ }
+
+ ctx->flags = p->flags;
+ init_waitqueue_head(&ctx->cq_wait);
+ init_completion(&ctx->ctx_done);
+ mutex_init(&ctx->uring_lock);
+ init_waitqueue_head(&ctx->wait);
+ for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
+ spin_lock_init(&ctx->pending_async[i].lock);
+ INIT_LIST_HEAD(&ctx->pending_async[i].list);
+ atomic_set(&ctx->pending_async[i].cnt, 0);
+ }
+ spin_lock_init(&ctx->completion_lock);
+ INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->cancel_list);
+ return ctx;
+}
+
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+
+ if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+ /* order cqe stores with ring update */
+ smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+
+ /*
+ * Write side barrier of tail update, app has read side. See
+ * comment at the top of this file.
+ */
+ smp_wmb();
+
+ if (wq_has_sleeper(&ctx->cq_wait)) {
+ wake_up_interruptible(&ctx->cq_wait);
+ kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+ }
+ }
+}
+
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+ unsigned tail;
+
+ tail = ctx->cached_cq_tail;
+ /* See comment at the top of the file */
+ smp_rmb();
+ if (tail + 1 == READ_ONCE(ring->r.head))
+ return NULL;
+
+ ctx->cached_cq_tail++;
+ return &ring->cqes[tail & ctx->cq_mask];
+}
+
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+ long res, unsigned ev_flags)
+{
+ struct io_uring_cqe *cqe;
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqring(ctx);
+ if (cqe) {
+ WRITE_ONCE(cqe->user_data, ki_user_data);
+ WRITE_ONCE(cqe->res, res);
+ WRITE_ONCE(cqe->flags, ev_flags);
+ } else {
+ unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+
+ WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+ }
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+ long res, unsigned ev_flags)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+}
+
+static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
+{
+ percpu_ref_put_many(&ctx->refs, refs);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+}
+
+static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
+{
+ struct io_kiocb *req;
+
+ if (!percpu_ref_tryget(&ctx->refs))
+ return NULL;
+
+ if (!state) {
+ req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+ if (unlikely(!req))
+ goto out;
+ } else if (!state->free_reqs) {
+ size_t sz;
+ int ret;
+
+ sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
+ ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
+ state->reqs);
+ if (unlikely(ret <= 0))
+ goto out;
+ state->free_reqs = ret - 1;
+ state->cur_req = 1;
+ req = state->reqs[0];
+ } else {
+ req = state->reqs[state->cur_req];
+ state->free_reqs--;
+ state->cur_req++;
+ }
+
+ req->ctx = ctx;
+ req->flags = 0;
+ refcount_set(&req->refs, 0);
+ return req;
+out:
+ io_ring_drop_ctx_refs(ctx, 1);
+ return NULL;
+}
+
+static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+{
+ if (*nr) {
+ kmem_cache_free_bulk(req_cachep, *nr, reqs);
+ io_ring_drop_ctx_refs(ctx, *nr);
+ *nr = 0;
+ }
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+ if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
+ io_ring_drop_ctx_refs(req->ctx, 1);
+ kmem_cache_free(req_cachep, req);
+ }
+}
+
+/*
+ * Find and free completed poll iocbs
+ */
+static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ struct list_head *done)
+{
+ void *reqs[IO_IOPOLL_BATCH];
+ int file_count, to_free;
+ struct file *file = NULL;
+ struct io_kiocb *req;
+
+ file_count = to_free = 0;
+ while (!list_empty(done)) {
+ req = list_first_entry(done, struct io_kiocb, list);
+ list_del(&req->list);
+
+ io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+
+ reqs[to_free++] = req;
+ (*nr_events)++;
+
+ /*
+ * Batched puts of the same file, to avoid dirtying the
+ * file usage count multiple times, if avoidable.
+ */
+ if (!(req->flags & REQ_F_FIXED_FILE)) {
+ if (!file) {
+ file = req->rw.ki_filp;
+ file_count = 1;
+ } else if (file == req->rw.ki_filp) {
+ file_count++;
+ } else {
+ fput_many(file, file_count);
+ file = req->rw.ki_filp;
+ file_count = 1;
+ }
+ }
+
+ if (to_free == ARRAY_SIZE(reqs))
+ io_free_req_many(ctx, reqs, &to_free);
+ }
+ io_commit_cqring(ctx);
+
+ if (file)
+ fput_many(file, file_count);
+ io_free_req_many(ctx, reqs, &to_free);
+}
+
+static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ long min)
+{
+ struct io_kiocb *req, *tmp;
+ LIST_HEAD(done);
+ bool spin;
+ int ret;
+
+ /*
+ * Only spin for completions if we don't have multiple devices hanging
+ * off our complete list, and we're under the requested amount.
+ */
+ spin = !ctx->poll_multi_file && *nr_events < min;
+
+ ret = 0;
+ list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+ struct kiocb *kiocb = &req->rw;
+
+ /*
+ * Move completed entries to our local list. If we find a
+ * request that requires polling, break out and complete
+ * the done list first, if we have entries there.
+ */
+ if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+ list_move_tail(&req->list, &done);
+ continue;
+ }
+ if (!list_empty(&done))
+ break;
+
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ if (ret < 0)
+ break;
+
+ if (ret && spin)
+ spin = false;
+ ret = 0;
+ }
+
+ if (!list_empty(&done))
+ io_iopoll_complete(ctx, nr_events, &done);
+
+ return ret;
+}
+
+/*
+ * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
+ * non-spinning poll check - we'll still enter the driver poll loop, but only
+ * as a non-spinning completion check.
+ */
+static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ long min)
+{
+ while (!list_empty(&ctx->poll_list)) {
+ int ret;
+
+ ret = io_do_iopoll(ctx, nr_events, min);
+ if (ret < 0)
+ return ret;
+ if (!min || *nr_events >= min)
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * We can't just wait for polled events to come to us, we have to actively
+ * find and complete them.
+ */
+static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_IOPOLL))
+ return;
+
+ mutex_lock(&ctx->uring_lock);
+ while (!list_empty(&ctx->poll_list)) {
+ unsigned int nr_events = 0;
+
+ io_iopoll_getevents(ctx, &nr_events, 1);
+ }
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
+{
+ int ret = 0;
+
+ do {
+ int tmin = 0;
+
+ if (*nr_events < min)
+ tmin = min - *nr_events;
+
+ ret = io_iopoll_getevents(ctx, nr_events, tmin);
+ if (ret <= 0)
+ break;
+ ret = 0;
+ } while (min && !*nr_events && !need_resched());
+
+ return ret;
+}
+
+static void kiocb_end_write(struct kiocb *kiocb)
+{
+ if (kiocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(kiocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from the
+ * submission thread.
+ */
+ if (S_ISREG(inode->i_mode))
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ file_end_write(kiocb->ki_filp);
+ }
+}
+
+static void io_fput(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ fput(req->rw.ki_filp);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ kiocb_end_write(kiocb);
+
+ io_fput(req);
+ io_cqring_add_event(req->ctx, req->user_data, res, 0);
+ io_free_req(req);
+}
+
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ kiocb_end_write(kiocb);
+
+ req->error = res;
+ if (res != -EAGAIN)
+ req->flags |= REQ_F_IOPOLL_COMPLETED;
+}
+
+/*
+ * After the iocb has been issued, it's safe to be found on the poll list.
+ * Adding the kiocb to the list AFTER submission ensures that we don't
+ * find it from an io_iopoll_getevents() thread before the issuer is done
+ * accessing the kiocb cookie.
+ */
+static void io_iopoll_req_issued(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /*
+ * Track whether we have multiple files in our lists. This will impact
+ * how we do polling eventually, not spinning if we're on potentially
+ * different devices.
+ */
+ if (list_empty(&ctx->poll_list)) {
+ ctx->poll_multi_file = false;
+ } else if (!ctx->poll_multi_file) {
+ struct io_kiocb *list_req;
+
+ list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+ list);
+ if (list_req->rw.ki_filp != req->rw.ki_filp)
+ ctx->poll_multi_file = true;
+ }
+
+ /*
+ * For fast devices, IO may have already completed. If it has, add
+ * it to the front so we find it first.
+ */
+ if (req->flags & REQ_F_IOPOLL_COMPLETED)
+ list_add(&req->list, &ctx->poll_list);
+ else
+ list_add_tail(&req->list, &ctx->poll_list);
+}
+
+static void io_file_put(struct io_submit_state *state, struct file *file)
+{
+ if (!state) {
+ fput(file);
+ } else if (state->file) {
+ int diff = state->has_refs - state->used_refs;
+
+ if (diff)
+ fput_many(state->file, diff);
+ state->file = NULL;
+ }
+}
+
+/*
+ * Get as many references to a file as we have IOs left in this submission,
+ * assuming most submissions are for one file, or at least that each file
+ * has more than one submission.
+ */
+static struct file *io_file_get(struct io_submit_state *state, int fd)
+{
+ if (!state)
+ return fget(fd);
+
+ if (state->file) {
+ if (state->fd == fd) {
+ state->used_refs++;
+ state->ios_left--;
+ return state->file;
+ }
+ io_file_put(state, NULL);
+ }
+ state->file = fget_many(fd, state->ios_left);
+ if (!state->file)
+ return NULL;
+
+ state->fd = fd;
+ state->has_refs = state->ios_left;
+ state->used_refs = 1;
+ state->ios_left--;
+ return state->file;
+}
+
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static bool io_file_supports_async(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+
+ if (S_ISBLK(mode) || S_ISCHR(mode))
+ return true;
+ if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+ return true;
+
+ return false;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ const struct io_uring_sqe *sqe = s->sqe;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct kiocb *kiocb = &req->rw;
+ unsigned ioprio, flags;
+ int fd, ret;
+
+ /* For -EAGAIN retry, everything is already prepped */
+ if (kiocb->ki_filp)
+ return 0;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files ||
+ (unsigned) fd >= ctx->nr_user_files))
+ return -EBADF;
+ kiocb->ki_filp = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ if (s->needs_fixed_file)
+ return -EBADF;
+ kiocb->ki_filp = io_file_get(state, fd);
+ if (unlikely(!kiocb->ki_filp))
+ return -EBADF;
+ if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+ force_nonblock = false;
+ }
+ kiocb->ki_pos = READ_ONCE(sqe->off);
+ kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ goto out_fput;
+
+ kiocb->ki_ioprio = ioprio;
+ } else
+ kiocb->ki_ioprio = get_current_ioprio();
+
+ ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+ if (unlikely(ret))
+ goto out_fput;
+ if (force_nonblock) {
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ req->flags |= REQ_F_FORCE_NONBLOCK;
+ }
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ ret = -EOPNOTSUPP;
+ if (!(kiocb->ki_flags & IOCB_DIRECT) ||
+ !kiocb->ki_filp->f_op->iopoll)
+ goto out_fput;
+
+ req->error = 0;
+ kiocb->ki_flags |= IOCB_HIPRI;
+ kiocb->ki_complete = io_complete_rw_iopoll;
+ } else {
+ if (kiocb->ki_flags & IOCB_HIPRI) {
+ ret = -EINVAL;
+ goto out_fput;
+ }
+ kiocb->ki_complete = io_complete_rw;
+ }
+ return 0;
+out_fput:
+ if (!(flags & IOSQE_FIXED_FILE)) {
+ /*
+ * In case of error, we didn't use this file reference. Drop it.
+ */
+ if (state)
+ state->used_refs--;
+ io_file_put(state, kiocb->ki_filp);
+ }
+ return ret;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+ switch (ret) {
+ case -EIOCBQUEUED:
+ break;
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /*
+ * We can't just restart the syscall, since previously
+ * submitted sqes may already be in progress. Just fail this
+ * IO with EINTR.
+ */
+ ret = -EINTR;
+ /* fall through */
+ default:
+ kiocb->ki_complete(kiocb, ret, 0);
+ }
+}
+
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
+{
+ size_t len = READ_ONCE(sqe->len);
+ struct io_mapped_ubuf *imu;
+ unsigned index, buf_index;
+ size_t offset;
+ u64 buf_addr;
+
+ /* attempt to use fixed buffers without having provided iovecs */
+ if (unlikely(!ctx->user_bufs))
+ return -EFAULT;
+
+ buf_index = READ_ONCE(sqe->buf_index);
+ if (unlikely(buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+
+ index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+ imu = &ctx->user_bufs[index];
+ buf_addr = READ_ONCE(sqe->addr);
+
+ /* overflow */
+ if (buf_addr + len < buf_addr)
+ return -EFAULT;
+ /* not inside the mapped region */
+ if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+ return -EFAULT;
+
+ /*
+ * May not be the start of the buffer; set the size appropriately
+ * and advance the iterator to the beginning of the requested range.
+ */
+ offset = buf_addr - imu->ubuf;
+ iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+ if (offset)
+ iov_iter_advance(iter, offset);
+ return 0;
+}
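
Worked through with illustrative numbers: a registered buffer at imu->ubuf = 0x10000 with imu->len = 64KiB and a request of addr = 0x12000, len = 4KiB passes both checks, gives offset = 0x2000, builds the bvec iterator over offset + len = 0x3000 bytes, and the iov_iter_advance() then skips the first 0x2000, leaving exactly the requested 4KiB window.
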
+
+static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
+ const struct sqe_submit *s, struct iovec **iovec,
+ struct iov_iter *iter)
+{
+ const struct io_uring_sqe *sqe = s->sqe;
+ void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ size_t sqe_len = READ_ONCE(sqe->len);
+ u8 opcode;
+
+ /*
+ * We're reading ->opcode for the second time, but the first read
+ * doesn't care whether it's _FIXED or not, so it doesn't matter
+ * whether ->opcode changes concurrently. The first read does care
+ * about whether it is a READ or a WRITE, so we don't trust this read
+ * for that purpose and instead let the caller pass in the read/write
+ * flag.
+ */
+ opcode = READ_ONCE(sqe->opcode);
+ if (opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED) {
+ ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ *iovec = NULL;
+ return ret;
+ }
+
+ if (!s->has_user)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (ctx->compat)
+ return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
+ iovec, iter);
+#endif
+
+ return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+}
+
+/*
+ * Make a note of the last file/offset/direction we punted to async
+ * context. We'll use this information to see if we can piggyback a
+ * sequential request onto the previous one, if it still hasn't been
+ * completed by the async worker.
+ */
+static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
+{
+ struct async_list *async_list = &req->ctx->pending_async[rw];
+ struct kiocb *kiocb = &req->rw;
+ struct file *filp = kiocb->ki_filp;
+ off_t io_end = kiocb->ki_pos + len;
+
+ if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
+ unsigned long max_pages;
+
+ /* Use 8x RA size as a decent limiter for both reads/writes */
+ max_pages = filp->f_ra.ra_pages;
+ if (!max_pages)
+ max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10);
+ max_pages *= 8;
+
+ /* If max pages are exceeded, reset the state */
+ len >>= PAGE_SHIFT;
+ if (async_list->io_pages + len <= max_pages) {
+ req->flags |= REQ_F_SEQ_PREV;
+ async_list->io_pages += len;
+ } else {
+ io_end = 0;
+ async_list->io_pages = 0;
+ }
+ }
+
+ /* New file? Reset state. */
+ if (async_list->file != filp) {
+ async_list->io_pages = 0;
+ async_list->file = filp;
+ }
+ async_list->io_end = io_end;
+}
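
With typical defaults this limiter is generous: a 128KiB readahead window is 32 pages on 4KiB pages, so max_pages = 256 and up to 1MiB of sequential punted IO can be chained onto the previous request before io_pages resets.
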
+
+static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct kiocb *kiocb = &req->rw;
+ struct iov_iter iter;
+ struct file *file;
+ size_t iov_count;
+ ssize_t ret;
+
+ ret = io_prep_rw(req, s, force_nonblock, state);
+ if (ret)
+ return ret;
+ file = kiocb->ki_filp;
+
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_fput;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->read_iter))
+ goto out_fput;
+
+ ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+ if (ret)
+ goto out_fput;
+
+ iov_count = iov_iter_count(&iter);
+ ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
+ if (!ret) {
+ ssize_t ret2;
+
+ /* Catch -EAGAIN return for forced non-blocking submission */
+ ret2 = call_read_iter(file, kiocb, &iter);
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ io_rw_done(kiocb, ret2);
+ } else {
+ /*
+ * If ->needs_lock is true, we're already in async
+ * context.
+ */
+ if (!s->needs_lock)
+ io_async_list_note(READ, req, iov_count);
+ ret = -EAGAIN;
+ }
+ }
+ kfree(iovec);
+out_fput:
+ /* Hold on to the file for -EAGAIN */
+ if (unlikely(ret && ret != -EAGAIN))
+ io_fput(req);
+ return ret;
+}
+
+static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct kiocb *kiocb = &req->rw;
+ struct iov_iter iter;
+ struct file *file;
+ size_t iov_count;
+ ssize_t ret;
+
+ ret = io_prep_rw(req, s, force_nonblock, state);
+ if (ret)
+ return ret;
+
+ ret = -EBADF;
+ file = kiocb->ki_filp;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_fput;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->write_iter))
+ goto out_fput;
+
+ ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+ if (ret)
+ goto out_fput;
+
+ iov_count = iov_iter_count(&iter);
+
+ ret = -EAGAIN;
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
+ /* If ->needs_lock is true, we're already in async context. */
+ if (!s->needs_lock)
+ io_async_list_note(WRITE, req, iov_count);
+ goto out_free;
+ }
+
+ ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
+ if (!ret) {
+ /*
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * io_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
+ */
+ if (S_ISREG(file_inode(file)->i_mode)) {
+ __sb_start_write(file_inode(file)->i_sb,
+ SB_FREEZE_WRITE, true);
+ __sb_writers_release(file_inode(file)->i_sb,
+ SB_FREEZE_WRITE);
+ }
+ kiocb->ki_flags |= IOCB_WRITE;
+ io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+ }
+out_free:
+ kfree(iovec);
+out_fput:
+ /* Hold on to the file for -EAGAIN */
+ if (unlikely(ret && ret != -EAGAIN))
+ io_fput(req);
+ return ret;
+}
+
+/*
+ * IORING_OP_NOP just posts a completion event, nothing else.
+ */
+static int io_nop(struct io_kiocb *req, u64 user_data)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ long err = 0;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ /*
+ * Twilight zone - it's possible that someone issued an opcode that
+ * has a file attached, then got -EAGAIN on submission, and changed
+ * the sqe before we retried it from async context. Avoid dropping
+ * a file reference for this malicious case, and flag the error.
+ */
+ if (req->rw.ki_filp) {
+ err = -EBADF;
+ io_fput(req);
+ }
+ io_cqring_add_event(ctx, user_data, err, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags;
+ int fd;
+
+ /* Prep already done */
+ if (req->rw.ki_filp)
+ return 0;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ return -EINVAL;
+
+ fd = READ_ONCE(sqe->fd);
+ flags = READ_ONCE(sqe->flags);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+ return -EBADF;
+ req->rw.ki_filp = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ req->rw.ki_filp = fget(fd);
+ if (unlikely(!req->rw.ki_filp))
+ return -EBADF;
+ }
+
+ return 0;
+}
+
+static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
+{
+ loff_t sqe_off = READ_ONCE(sqe->off);
+ loff_t sqe_len = READ_ONCE(sqe->len);
+ loff_t end = sqe_off + sqe_len;
+ unsigned fsync_flags;
+ int ret;
+
+ fsync_flags = READ_ONCE(sqe->fsync_flags);
+ if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
+ return -EINVAL;
+
+ ret = io_prep_fsync(req, sqe);
+ if (ret)
+ return ret;
+
+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+ end > 0 ? end : LLONG_MAX,
+ fsync_flags & IORING_FSYNC_DATASYNC);
+
+ io_fput(req);
+ io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+ struct io_poll_iocb *poll = &req->poll;
+
+ spin_lock(&poll->head->lock);
+ WRITE_ONCE(poll->canceled, true);
+ if (!list_empty(&poll->wait.entry)) {
+ list_del_init(&poll->wait.entry);
+ queue_work(req->ctx->sqo_wq, &req->work);
+ }
+ spin_unlock(&poll->head->lock);
+
+ list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&ctx->cancel_list)) {
+ req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
+ io_poll_remove_one(req);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *poll_req, *next;
+ int ret = -ENOENT;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ sqe->poll_events)
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+ if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+ io_poll_remove_one(poll_req);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+ io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+ io_fput(req);
+ io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_poll_iocb *poll = &req->poll;
+ struct poll_table_struct pt = { ._key = poll->events };
+ struct io_ring_ctx *ctx = req->ctx;
+ __poll_t mask = 0;
+
+ if (!READ_ONCE(poll->canceled))
+ mask = vfs_poll(poll->file, &pt) & poll->events;
+
+ /*
+ * Note that ->ki_cancel callers also delete iocb from active_reqs after
+ * calling ->ki_cancel. We need the ctx_lock roundtrip here to
+ * synchronize with them. In the cancellation case the list_del_init
+ * itself is not actually needed, but harmless so we keep it in to
+ * avoid further branches in the fast path.
+ */
+ spin_lock_irq(&ctx->completion_lock);
+ if (!mask && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
+ list_del_init(&req->list);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+ wait);
+ struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+ struct io_ring_ctx *ctx = req->ctx;
+ __poll_t mask = key_to_poll(key);
+
+ poll->woken = true;
+
+ /* for instances that support it, check for an event match first: */
+ if (mask) {
+ unsigned long flags;
+
+ if (!(mask & poll->events))
+ return 0;
+
+ /* try to complete the iocb inline if we can: */
+ if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ list_del(&req->list);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ list_del_init(&poll->wait.entry);
+ io_poll_complete(req, mask);
+ return 1;
+ }
+ }
+
+ list_del_init(&poll->wait.entry);
+ queue_work(ctx->sqo_wq, &req->work);
+ return 1;
+}
+
+struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
+ int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+ if (unlikely(pt->req->poll.head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ pt->req->poll.head = head;
+ add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_poll_iocb *poll = &req->poll;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_table ipt;
+ unsigned flags;
+ __poll_t mask;
+ u16 events;
+ int fd;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ return -EINVAL;
+
+ INIT_WORK(&req->work, io_poll_complete_work);
+ events = READ_ONCE(sqe->poll_events);
+ poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+ return -EBADF;
+ poll->file = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ poll->file = fget(fd);
+ }
+ if (unlikely(!poll->file))
+ return -EBADF;
+
+ poll->head = NULL;
+ poll->woken = false;
+ poll->canceled = false;
+
+ ipt.pt._qproc = io_poll_queue_proc;
+ ipt.pt._key = poll->events;
+ ipt.req = req;
+ ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+ /* initialize the list so that we can do list_empty checks */
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+ /* one for removal from waitqueue, one for this function */
+ refcount_set(&req->refs, 2);
+
+ mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+ if (unlikely(!poll->head)) {
+ /* we did not manage to set up a waitqueue, done */
+ goto out;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&poll->head->lock);
+ if (poll->woken) {
+ /* wake_up context handles the rest */
+ mask = 0;
+ ipt.error = 0;
+ } else if (mask || ipt.error) {
+ /* if we get an error or a mask we are done */
+ WARN_ON_ONCE(list_empty(&poll->wait.entry));
+ list_del_init(&poll->wait.entry);
+ } else {
+ /* actually waiting for an event */
+ list_add_tail(&req->list, &ctx->cancel_list);
+ }
+ spin_unlock(&poll->head->lock);
+ spin_unlock_irq(&ctx->completion_lock);
+
+out:
+ if (unlikely(ipt.error)) {
+ if (!(flags & IOSQE_FIXED_FILE))
+ fput(poll->file);
+ /*
+ * Drop one of our refs to this req, __io_submit_sqe() will
+ * drop the other one since we're returning an error.
+ */
+ io_free_req(req);
+ return ipt.error;
+ }
+
+ if (mask)
+ io_poll_complete(req, mask);
+ io_free_req(req);
+ return 0;
+}
+
+static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct sqe_submit *s, bool force_nonblock,
+ struct io_submit_state *state)
+{
+ ssize_t ret;
+ int opcode;
+
+ if (unlikely(s->index >= ctx->sq_entries))
+ return -EINVAL;
+ req->user_data = READ_ONCE(s->sqe->user_data);
+
+ opcode = READ_ONCE(s->sqe->opcode);
+ switch (opcode) {
+ case IORING_OP_NOP:
+ ret = io_nop(req, req->user_data);
+ break;
+ case IORING_OP_READV:
+ if (unlikely(s->sqe->buf_index))
+ return -EINVAL;
+ ret = io_read(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_WRITEV:
+ if (unlikely(s->sqe->buf_index))
+ return -EINVAL;
+ ret = io_write(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_READ_FIXED:
+ ret = io_read(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_WRITE_FIXED:
+ ret = io_write(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_FSYNC:
+ ret = io_fsync(req, s->sqe, force_nonblock);
+ break;
+ case IORING_OP_POLL_ADD:
+ ret = io_poll_add(req, s->sqe);
+ break;
+ case IORING_OP_POLL_REMOVE:
+ ret = io_poll_remove(req, s->sqe);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ if (req->error == -EAGAIN)
+ return -EAGAIN;
+
+ /* workqueue context doesn't hold uring_lock, grab it now */
+ if (s->needs_lock)
+ mutex_lock(&ctx->uring_lock);
+ io_iopoll_req_issued(req);
+ if (s->needs_lock)
+ mutex_unlock(&ctx->uring_lock);
+ }
+
+ return 0;
+}
+
+static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
+ const struct io_uring_sqe *sqe)
+{
+ switch (sqe->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ return &ctx->pending_async[READ];
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ return &ctx->pending_async[WRITE];
+ default:
+ return NULL;
+ }
+}
+
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+ u8 opcode = READ_ONCE(sqe->opcode);
+
+ return !(opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED);
+}
+
+static void io_sq_wq_submit_work(struct work_struct *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct mm_struct *cur_mm = NULL;
+ struct async_list *async_list;
+ LIST_HEAD(req_list);
+ mm_segment_t old_fs;
+ int ret;
+
+ async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
+restart:
+ do {
+ struct sqe_submit *s = &req->submit;
+ const struct io_uring_sqe *sqe = s->sqe;
+
+ /* Ensure we clear previously set forced non-block flag */
+ req->flags &= ~REQ_F_FORCE_NONBLOCK;
+ req->rw.ki_flags &= ~IOCB_NOWAIT;
+
+ ret = 0;
+ if (io_sqe_needs_user(sqe) && !cur_mm) {
+ if (!mmget_not_zero(ctx->sqo_mm)) {
+ ret = -EFAULT;
+ } else {
+ cur_mm = ctx->sqo_mm;
+ use_mm(cur_mm);
+ old_fs = get_fs();
+ set_fs(USER_DS);
+ }
+ }
+
+ if (!ret) {
+ s->has_user = cur_mm != NULL;
+ s->needs_lock = true;
+ do {
+ ret = __io_submit_sqe(ctx, req, s, false, NULL);
+ /*
+ * We can get EAGAIN for polled IO even though
+ * we're forcing a sync submission from here,
+ * since we can't wait for request slots on the
+ * block side.
+ */
+ if (ret != -EAGAIN)
+ break;
+ cond_resched();
+ } while (1);
+ }
+ if (ret) {
+ io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ }
+
+ /* async context always uses a copy of the sqe */
+ kfree(sqe);
+
+ if (!async_list)
+ break;
+ if (!list_empty(&req_list)) {
+ req = list_first_entry(&req_list, struct io_kiocb,
+ list);
+ list_del(&req->list);
+ continue;
+ }
+ if (list_empty(&async_list->list))
+ break;
+
+ req = NULL;
+ spin_lock(&async_list->lock);
+ if (list_empty(&async_list->list)) {
+ spin_unlock(&async_list->lock);
+ break;
+ }
+ list_splice_init(&async_list->list, &req_list);
+ spin_unlock(&async_list->lock);
+
+ req = list_first_entry(&req_list, struct io_kiocb, list);
+ list_del(&req->list);
+ } while (req);
+
+ /*
+ * Rare case of racing with a submitter. If we find the count has
+ * dropped to zero AND we have pending work items, then restart
+ * the processing. This is a tiny race window.
+ */
+ if (async_list) {
+ ret = atomic_dec_return(&async_list->cnt);
+ while (!ret && !list_empty(&async_list->list)) {
+ spin_lock(&async_list->lock);
+ atomic_inc(&async_list->cnt);
+ list_splice_init(&async_list->list, &req_list);
+ spin_unlock(&async_list->lock);
+
+ if (!list_empty(&req_list)) {
+ req = list_first_entry(&req_list,
+ struct io_kiocb, list);
+ list_del(&req->list);
+ goto restart;
+ }
+ ret = atomic_dec_return(&async_list->cnt);
+ }
+ }
+
+ if (cur_mm) {
+ set_fs(old_fs);
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ }
+}
+
+/*
+ * See if we can piggyback onto previously submitted work that is still
+ * running. We currently only allow this if the new request is sequential
+ * to the previous one we punted.
+ */
+static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
+{
+ bool ret = false;
+
+ if (!list)
+ return false;
+ if (!(req->flags & REQ_F_SEQ_PREV))
+ return false;
+ if (!atomic_read(&list->cnt))
+ return false;
+
+ ret = true;
+ spin_lock(&list->lock);
+ list_add_tail(&req->list, &list->list);
+ if (!atomic_read(&list->cnt)) {
+ list_del_init(&req->list);
+ ret = false;
+ }
+ spin_unlock(&list->lock);
+ return ret;
+}
+
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+ struct io_submit_state *state)
+{
+ struct io_kiocb *req;
+ ssize_t ret;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
+ return -EINVAL;
+
+ req = io_get_req(ctx, state);
+ if (unlikely(!req))
+ return -EAGAIN;
+
+ req->rw.ki_filp = NULL;
+
+ ret = __io_submit_sqe(ctx, req, s, true, state);
+ if (ret == -EAGAIN) {
+ struct io_uring_sqe *sqe_copy;
+
+ sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+ if (sqe_copy) {
+ struct async_list *list;
+
+ memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
+ s->sqe = sqe_copy;
+
+ memcpy(&req->submit, s, sizeof(*s));
+ list = io_async_list_from_sqe(ctx, s->sqe);
+ if (!io_add_to_prev_work(list, req)) {
+ if (list)
+ atomic_inc(&list->cnt);
+ INIT_WORK(&req->work, io_sq_wq_submit_work);
+ queue_work(ctx->sqo_wq, &req->work);
+ }
+ ret = 0;
+ }
+ }
+ if (ret)
+ io_free_req(req);
+
+ return ret;
+}
+
+/*
+ * Batched submission is done, ensure local IO is flushed out.
+ */
+static void io_submit_state_end(struct io_submit_state *state)
+{
+ blk_finish_plug(&state->plug);
+ io_file_put(state, NULL);
+ if (state->free_reqs)
+ kmem_cache_free_bulk(req_cachep, state->free_reqs,
+ &state->reqs[state->cur_req]);
+}
+
+/*
+ * Start submission side cache.
+ */
+static void io_submit_state_start(struct io_submit_state *state,
+ struct io_ring_ctx *ctx, unsigned max_ios)
+{
+ blk_start_plug(&state->plug);
+ state->free_reqs = 0;
+ state->file = NULL;
+ state->ios_left = max_ios;
+}
+
+static void io_commit_sqring(struct io_ring_ctx *ctx)
+{
+ struct io_sq_ring *ring = ctx->sq_ring;
+
+ if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+ /*
+ * Ensure any loads from the SQEs are done at this point,
+ * since once we write the new head, the application could
+ * write new data to them.
+ */
+ smp_store_release(&ring->r.head, ctx->cached_sq_head);
+
+ /*
+ * Write side barrier of head update, app has read side. See
+ * comment at the top of this file.
+ */
+ smp_wmb();
+ }
+}
+
+/*
+ * Undo last io_get_sqring()
+ */
+static void io_drop_sqring(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head--;
+}
+
+/*
+ * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * that is mapped by userspace. This means that care needs to be taken to
+ * ensure that reads are stable, as we cannot rely on userspace always
+ * being a good citizen. If members of the sqe are validated and then later
+ * used, it's important that those reads are done through READ_ONCE() to
+ * prevent a re-load down the line.
+ */
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+ struct io_sq_ring *ring = ctx->sq_ring;
+ unsigned head;
+
+ /*
+ * The cached sq head (or cq tail) serves two purposes:
+ *
+ * 1) allows us to batch the cost of updating the user-visible
+ * head.
+ * 2) allows the kernel side to track the head on its own, even
+ * though the application is the one updating it.
+ */
+ head = ctx->cached_sq_head;
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (head == READ_ONCE(ring->r.tail))
+ return false;
+
+ head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+ if (head < ctx->sq_entries) {
+ s->index = head;
+ s->sqe = &ctx->sq_sqes[head];
+ ctx->cached_sq_head++;
+ return true;
+ }
+
+ /* drop invalid entries */
+ ctx->cached_sq_head++;
+ ring->dropped++;
+ /* See comment at the top of this file */
+ smp_wmb();
+ return false;
+}
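
To make the READ_ONCE() rule above concrete, a hypothetical counter-example (MAX_LEN and do_io() are invented for illustration): without the snapshot, the compiler is free to re-read a user-mapped field after validation, and userspace can change it in between.

#include <stddef.h>

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))	/* as in the kernel */
#define MAX_LEN 4096						/* hypothetical bound */

struct shared_sqe { size_t len; };	/* imagine this page is user-writable */

extern int do_io(const void *buf, size_t len);	/* hypothetical sink */

int submit_checked(struct shared_sqe *sqe, const void *buf)
{
	/* snapshot once; re-reading sqe->len after the check would let a
	 * concurrent user store bypass the validation */
	size_t len = READ_ONCE(sqe->len);

	if (len > MAX_LEN)
		return -1;
	return do_io(buf, len);
}
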
+
+static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
+ unsigned int nr, bool has_user, bool mm_fault)
+{
+ struct io_submit_state state, *statep = NULL;
+ int ret, i, submitted = 0;
+
+ if (nr > IO_PLUG_THRESHOLD) {
+ io_submit_state_start(&state, ctx, nr);
+ statep = &state;
+ }
+
+ for (i = 0; i < nr; i++) {
+ if (unlikely(mm_fault)) {
+ ret = -EFAULT;
+ } else {
+ sqes[i].has_user = has_user;
+ sqes[i].needs_lock = true;
+ sqes[i].needs_fixed_file = true;
+ ret = io_submit_sqe(ctx, &sqes[i], statep);
+ }
+ if (!ret) {
+ submitted++;
+ continue;
+ }
+
+ io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+ }
+
+ if (statep)
+ io_submit_state_end(&state);
+
+ return submitted;
+}
+
+static int io_sq_thread(void *data)
+{
+ struct sqe_submit sqes[IO_IOPOLL_BATCH];
+ struct io_ring_ctx *ctx = data;
+ struct mm_struct *cur_mm = NULL;
+ mm_segment_t old_fs;
+ DEFINE_WAIT(wait);
+ unsigned inflight;
+ unsigned long timeout;
+
+ old_fs = get_fs();
+ set_fs(USER_DS);
+
+ timeout = inflight = 0;
+ while (!kthread_should_stop() && !ctx->sqo_stop) {
+ bool all_fixed, mm_fault = false;
+ int i;
+
+ if (inflight) {
+ unsigned nr_events = 0;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /*
+ * We disallow the app entering submit/complete
+ * with polling, but we still need to lock the
+ * ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+ io_iopoll_check(ctx, &nr_events, 0);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ /*
+ * Normal IO, just pretend everything completed.
+ * We don't have to poll completions for that.
+ */
+ nr_events = inflight;
+ }
+
+ inflight -= nr_events;
+ if (!inflight)
+ timeout = jiffies + ctx->sq_thread_idle;
+ }
+
+ if (!io_get_sqring(ctx, &sqes[0])) {
+ /*
+ * We're polling. If we're within the defined idle
+ * period, then let us spin without work before going
+ * to sleep.
+ */
+ if (inflight || !time_after(jiffies, timeout)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * Drop cur_mm before scheduling, we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ cur_mm = NULL;
+ }
+
+ prepare_to_wait(&ctx->sqo_wait, &wait,
+ TASK_INTERRUPTIBLE);
+
+ /* Tell userspace we may need a wakeup call */
+ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+
+ if (!io_get_sqring(ctx, &sqes[0])) {
+ if (kthread_should_stop()) {
+ finish_wait(&ctx->sqo_wait, &wait);
+ break;
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ schedule();
+ finish_wait(&ctx->sqo_wait, &wait);
+
+ ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+ continue;
+ }
+ finish_wait(&ctx->sqo_wait, &wait);
+
+ ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+ }
+
+ i = 0;
+ all_fixed = true;
+ do {
+ if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
+ all_fixed = false;
+
+ i++;
+ if (i == ARRAY_SIZE(sqes))
+ break;
+ } while (io_get_sqring(ctx, &sqes[i]));
+
+ /* Unless all new commands are FIXED regions, grab mm */
+ if (!all_fixed && !cur_mm) {
+ mm_fault = !mmget_not_zero(ctx->sqo_mm);
+ if (!mm_fault) {
+ use_mm(ctx->sqo_mm);
+ cur_mm = ctx->sqo_mm;
+ }
+ }
+
+ inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
+ mm_fault);
+
+ /* Commit SQ ring head once we've consumed all SQEs */
+ io_commit_sqring(ctx);
+ }
+
+ set_fs(old_fs);
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ }
+ return 0;
+}
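+
+/*
+ * The userspace counterpart of the IORING_SQ_NEED_WAKEUP handshake above
+ * might look like this sketch (helper names illustrative):
+ *
+ *	fill_sqes();
+ *	if (READ_ONCE(sq_flags) & IORING_SQ_NEED_WAKEUP)
+ *		io_uring_enter(fd, to_submit, 0, IORING_ENTER_SQ_WAKEUP,
+ *			       NULL, 0);
+ *
+ * While the thread is still within its idle period, no syscall is needed
+ * at all.
+ */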
+
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+{
+ struct io_submit_state state, *statep = NULL;
+ int i, ret = 0, submit = 0;
+
+ if (to_submit > IO_PLUG_THRESHOLD) {
+ io_submit_state_start(&state, ctx, to_submit);
+ statep = &state;
+ }
+
+ for (i = 0; i < to_submit; i++) {
+ struct sqe_submit s;
+
+ if (!io_get_sqring(ctx, &s))
+ break;
+
+ s.has_user = true;
+ s.needs_lock = false;
+ s.needs_fixed_file = false;
+
+ ret = io_submit_sqe(ctx, &s, statep);
+ if (ret) {
+ io_drop_sqring(ctx);
+ break;
+ }
+
+ submit++;
+ }
+ io_commit_sqring(ctx);
+
+ if (statep)
+ io_submit_state_end(statep);
+
+ return submit ? submit : ret;
+}
+
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+ return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
+/*
+ * Wait until events become available, if we don't already have some. The
+ * application must reap them itself, as they reside on the shared cq ring.
+ */
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
+ const sigset_t __user *sig, size_t sigsz)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+ sigset_t ksigmask, sigsaved;
+ DEFINE_WAIT(wait);
+ int ret;
+
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (io_cqring_events(ring) >= min_events)
+ return 0;
+
+ if (sig) {
+ ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
+
+ ret = 0;
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (io_cqring_events(ring) >= min_events)
+ break;
+
+ schedule();
+
+ ret = -EINTR;
+ if (signal_pending(current))
+ break;
+ } while (1);
+
+ finish_wait(&ctx->wait, &wait);
+
+ if (sig)
+ restore_user_sigmask(sig, &sigsaved);
+
+ return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+}
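+
+/*
+ * The reap loop on the application side might look like the following
+ * sketch (pointer names are hypothetical, derived from the cq_off fields
+ * returned by io_uring_setup()):
+ *
+ *	head = *cq_head;
+ *	smp_rmb();
+ *	while (head != *cq_tail) {
+ *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
+ *
+ *		handle_completion(cqe);
+ *		head++;
+ *	}
+ *	smp_store_release(cq_head, head);
+ */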
+
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock) {
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+ kfree_skb(skb);
+ }
+#else
+ int i;
+
+ for (i = 0; i < ctx->nr_user_files; i++)
+ fput(ctx->user_files[i]);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+ if (!ctx->user_files)
+ return -ENXIO;
+
+ __io_sqe_files_unregister(ctx);
+ kfree(ctx->user_files);
+ ctx->user_files = NULL;
+ ctx->nr_user_files = 0;
+ return 0;
+}
+
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+ if (ctx->sqo_thread) {
+ ctx->sqo_stop = 1;
+ mb();
+ kthread_stop(ctx->sqo_thread);
+ ctx->sqo_thread = NULL;
+ }
+}
+
+static void io_finish_async(struct io_ring_ctx *ctx)
+{
+ io_sq_thread_stop(ctx);
+
+ if (ctx->sqo_wq) {
+ destroy_workqueue(ctx->sqo_wq);
+ ctx->sqo_wq = NULL;
+ }
+}
+
+#if defined(CONFIG_UNIX)
+static void io_destruct_skb(struct sk_buff *skb)
+{
+ struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+
+ io_finish_async(ctx);
+ unix_destruct_scm(skb);
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing.
+ */
+static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+{
+ struct sock *sk = ctx->ring_sock->sk;
+ struct scm_fp_list *fpl;
+ struct sk_buff *skb;
+ int i;
+
+ if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ unsigned long inflight = ctx->user->unix_inflight + nr;
+
+ if (inflight > task_rlimit(current, RLIMIT_NOFILE))
+ return -EMFILE;
+ }
+
+ fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+
+ skb = alloc_skb(0, GFP_KERNEL);
+ if (!skb) {
+ kfree(fpl);
+ return -ENOMEM;
+ }
+
+ skb->sk = sk;
+ skb->destructor = io_destruct_skb;
+
+ fpl->user = get_uid(ctx->user);
+ for (i = 0; i < nr; i++) {
+ fpl->fp[i] = get_file(ctx->user_files[i + offset]);
+ unix_inflight(fpl->user, fpl->fp[i]);
+ }
+
+ fpl->max = fpl->count = nr;
+ UNIXCB(skb).fp = fpl;
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ for (i = 0; i < nr; i++)
+ fput(fpl->fp[i]);
+
+ return 0;
+}
+
+/*
+ * If UNIX sockets are enabled, fd passing can create a reference cycle that
+ * breaks regular reference counting. We rely on the UNIX garbage collector
+ * to take care of this problem for us.
+ */
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+ unsigned left, total;
+ int ret = 0;
+
+ total = 0;
+ left = ctx->nr_user_files;
+ while (left) {
+ unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
+
+ ret = __io_sqe_files_scm(ctx, this_files, total);
+ if (ret)
+ break;
+ left -= this_files;
+ total += this_files;
+ }
+
+ if (!ret)
+ return 0;
+
+ while (total < ctx->nr_user_files) {
+ fput(ctx->user_files[total]);
+ total++;
+ }
+
+ return ret;
+}
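+
+/*
+ * Note the chunking above: SCM_RIGHTS limits each skb to SCM_MAX_FD (253)
+ * files, so registering e.g. 1000 files queues four skbs on the ring
+ * socket, holding 253 + 253 + 253 + 241 file references respectively.
+ */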
+#else
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+ return 0;
+}
+#endif
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ __s32 __user *fds = (__s32 __user *) arg;
+ int fd, ret = 0;
+ unsigned i;
+
+ if (ctx->user_files)
+ return -EBUSY;
+ if (!nr_args)
+ return -EINVAL;
+ if (nr_args > IORING_MAX_FIXED_FILES)
+ return -EMFILE;
+
+ ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
+ if (!ctx->user_files)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_args; i++) {
+ ret = -EFAULT;
+ if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+ break;
+
+ ctx->user_files[i] = fget(fd);
+
+ ret = -EBADF;
+ if (!ctx->user_files[i])
+ break;
+ /*
+ * Don't allow io_uring instances to be registered. If UNIX
+ * isn't enabled, then this causes a reference cycle and this
+ * instance can never get freed. If UNIX is enabled we'll
+ * handle it just fine, but there's still no point in allowing
+ * a ring fd as it doesn't support regular read/write anyway.
+ */
+ if (ctx->user_files[i]->f_op == &io_uring_fops) {
+ fput(ctx->user_files[i]);
+ break;
+ }
+ ctx->nr_user_files++;
+ ret = 0;
+ }
+
+ if (ret) {
+ for (i = 0; i < ctx->nr_user_files; i++)
+ fput(ctx->user_files[i]);
+
+ kfree(ctx->user_files);
+ ctx->user_files = NULL;
+ ctx->nr_user_files = 0;
+ return ret;
+ }
+
+ ret = io_sqe_files_scm(ctx);
+ if (ret)
+ io_sqe_files_unregister(ctx);
+
+ return ret;
+}
+
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ int ret;
+
+ init_waitqueue_head(&ctx->sqo_wait);
+ mmgrab(current->mm);
+ ctx->sqo_mm = current->mm;
+
+ ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+ if (!ctx->sq_thread_idle)
+ ctx->sq_thread_idle = HZ;
+
+ ret = -EINVAL;
+ if (!cpu_possible(p->sq_thread_cpu))
+ goto err;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (p->flags & IORING_SETUP_SQ_AFF) {
+ int cpu;
+
+ cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+ ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+ ctx, cpu,
+ "io_uring-sq");
+ } else {
+ ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+ "io_uring-sq");
+ }
+ if (IS_ERR(ctx->sqo_thread)) {
+ ret = PTR_ERR(ctx->sqo_thread);
+ ctx->sqo_thread = NULL;
+ goto err;
+ }
+ wake_up_process(ctx->sqo_thread);
+ } else if (p->flags & IORING_SETUP_SQ_AFF) {
+ /* Can't have SQ_AFF without SQPOLL */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Use QD, or 2 * CPUS, whichever is smaller */
+ ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+ min(ctx->sq_entries - 1, 2 * num_online_cpus()));
+ if (!ctx->sqo_wq) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ return 0;
+err:
+ io_sq_thread_stop(ctx);
+ mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ return ret;
+}
+
+static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+{
+ atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+{
+ unsigned long page_limit, cur_pages, new_pages;
+
+ /* Don't allow more pages than we can safely lock */
+ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ do {
+ cur_pages = atomic_long_read(&user->locked_vm);
+ new_pages = cur_pages + nr_pages;
+ if (new_pages > page_limit)
+ return -ENOMEM;
+ } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+ new_pages) != cur_pages);
+
+ return 0;
+}
+
+static void io_mem_free(void *ptr)
+{
+ struct page *page = virt_to_head_page(ptr);
+
+ if (put_page_testzero(page))
+ free_compound_page(page);
+}
+
+static void *io_mem_alloc(size_t size)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
+ __GFP_NORETRY;
+
+ return (void *) __get_free_pages(gfp_flags, get_order(size));
+}
+
+static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
+{
+ struct io_sq_ring *sq_ring;
+ struct io_cq_ring *cq_ring;
+ size_t bytes;
+
+ bytes = struct_size(sq_ring, array, sq_entries);
+ bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
+ bytes += struct_size(cq_ring, cqes, cq_entries);
+
+ return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+}
+
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+ int i, j;
+
+ if (!ctx->user_bufs)
+ return -ENXIO;
+
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+ for (j = 0; j < imu->nr_bvecs; j++)
+ put_page(imu->bvec[j].bv_page);
+
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, imu->nr_bvecs);
+ kfree(imu->bvec);
+ imu->nr_bvecs = 0;
+ }
+
+ kfree(ctx->user_bufs);
+ ctx->user_bufs = NULL;
+ ctx->nr_user_bufs = 0;
+ return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+ void __user *arg, unsigned index)
+{
+ struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+ if (ctx->compat) {
+ struct compat_iovec __user *ciovs;
+ struct compat_iovec ciov;
+
+ ciovs = (struct compat_iovec __user *) arg;
+ if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+ return -EFAULT;
+
+ dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+ dst->iov_len = ciov.iov_len;
+ return 0;
+ }
+#endif
+ src = (struct iovec __user *) arg;
+ if (copy_from_user(dst, &src[index], sizeof(*dst)))
+ return -EFAULT;
+ return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct vm_area_struct **vmas = NULL;
+ struct page **pages = NULL;
+ int i, j, got_pages = 0;
+ int ret = -EINVAL;
+
+ if (ctx->user_bufs)
+ return -EBUSY;
+ if (!nr_args || nr_args > UIO_MAXIOV)
+ return -EINVAL;
+
+ ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+ GFP_KERNEL);
+ if (!ctx->user_bufs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_args; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ unsigned long off, start, end, ubuf;
+ int pret, nr_pages;
+ struct iovec iov;
+ size_t size;
+
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
+
+ /*
+ * Don't impose further limits on the size and buffer
+ * constraints here; we'll -EINVAL later when IO is
+ * submitted if they are wrong.
+ */
+ ret = -EFAULT;
+ if (!iov.iov_base || !iov.iov_len)
+ goto err;
+
+ /* arbitrary limit, but we need something */
+ if (iov.iov_len > SZ_1G)
+ goto err;
+
+ ubuf = (unsigned long) iov.iov_base;
+ end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ubuf >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ if (ctx->account_mem) {
+ ret = io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ goto err;
+ }
+
+ ret = 0;
+ if (!pages || nr_pages > got_pages) {
+ kfree(vmas);
+ kfree(pages);
+ pages = kmalloc_array(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ vmas = kmalloc_array(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!pages || !vmas) {
+ ret = -ENOMEM;
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+ got_pages = nr_pages;
+ }
+
+ imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!imu->bvec) {
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+
+ ret = 0;
+ down_read(&current->mm->mmap_sem);
+ pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
+ pages, vmas);
+ if (pret == nr_pages) {
+ /* don't support file backed memory */
+ for (j = 0; j < nr_pages; j++) {
+ struct vm_area_struct *vma = vmas[j];
+
+ if (vma->vm_file &&
+ !is_file_hugepages(vma->vm_file)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ } else {
+ ret = pret < 0 ? pret : -EFAULT;
+ }
+ up_read(&current->mm->mmap_sem);
+ if (ret) {
+ /*
+ * if we did partial map, or found file backed vmas,
+ * release any pages we did get
+ */
+ if (pret > 0) {
+ for (j = 0; j < pret; j++)
+ put_page(pages[j]);
+ }
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+
+ off = ubuf & ~PAGE_MASK;
+ size = iov.iov_len;
+ for (j = 0; j < nr_pages; j++) {
+ size_t vec_len;
+
+ vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ imu->bvec[j].bv_page = pages[j];
+ imu->bvec[j].bv_len = vec_len;
+ imu->bvec[j].bv_offset = off;
+ off = 0;
+ size -= vec_len;
+ }
+ /* store original address for later verification */
+ imu->ubuf = ubuf;
+ imu->len = iov.iov_len;
+ imu->nr_bvecs = nr_pages;
+
+ ctx->nr_user_bufs++;
+ }
+ kfree(pages);
+ kfree(vmas);
+ return 0;
+err:
+ kfree(pages);
+ kfree(vmas);
+ io_sqe_buffer_unregister(ctx);
+ return ret;
+}
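+
+/*
+ * A minimal (hypothetical) userspace call into this path, via the
+ * IORING_REGISTER_BUFFERS opcode handled in __io_uring_register() below:
+ *
+ *	struct iovec iov = { .iov_base = buf, .iov_len = len };
+ *
+ *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
+ *
+ * After this, an sqe can reference the pinned buffer by index instead of
+ * passing an address that must be re-pinned on every submission.
+ */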
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
+ io_finish_async(ctx);
+ if (ctx->sqo_mm)
+ mmdrop(ctx->sqo_mm);
+
+ io_iopoll_reap_events(ctx);
+ io_sqe_buffer_unregister(ctx);
+ io_sqe_files_unregister(ctx);
+
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock)
+ sock_release(ctx->ring_sock);
+#endif
+
+ io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->sq_sqes);
+ io_mem_free(ctx->cq_ring);
+
+ percpu_ref_exit(&ctx->refs);
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user,
+ ring_pages(ctx->sq_entries, ctx->cq_entries));
+ free_uid(ctx->user);
+ kfree(ctx);
+}
+
+static __poll_t io_uring_poll(struct file *file, poll_table *wait)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(file, &ctx->cq_wait, wait);
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static int io_uring_fasync(int fd, struct file *file, int on)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ return fasync_helper(fd, file, on, &ctx->cq_fasync);
+}
+
+static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+{
+ mutex_lock(&ctx->uring_lock);
+ percpu_ref_kill(&ctx->refs);
+ mutex_unlock(&ctx->uring_lock);
+
+ io_poll_remove_all(ctx);
+ io_iopoll_reap_events(ctx);
+ wait_for_completion(&ctx->ctx_done);
+ io_ring_ctx_free(ctx);
+}
+
+static int io_uring_release(struct inode *inode, struct file *file)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ file->private_data = NULL;
+ io_ring_ctx_wait_and_kill(ctx);
+ return 0;
+}
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long sz = vma->vm_end - vma->vm_start;
+ struct io_ring_ctx *ctx = file->private_data;
+ unsigned long pfn;
+ struct page *page;
+ void *ptr;
+
+ switch (offset) {
+ case IORING_OFF_SQ_RING:
+ ptr = ctx->sq_ring;
+ break;
+ case IORING_OFF_SQES:
+ ptr = ctx->sq_sqes;
+ break;
+ case IORING_OFF_CQ_RING:
+ ptr = ctx->cq_ring;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ page = virt_to_head_page(ptr);
+ if (sz > (PAGE_SIZE << compound_order(page)))
+ return -EINVAL;
+
+ pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+ return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
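+
+/*
+ * Userspace maps the three regions with the fixed offsets handled above,
+ * roughly as follows (sizes derived from the sq_off/cq_off fields and
+ * ring_entries values returned by io_uring_setup(); sketch only):
+ *
+ *	sq = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
+ *		  MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
+ *	sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
+ *		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
+ *	cq = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
+ *		  MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
+ */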
+
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+ u32, min_complete, u32, flags, const sigset_t __user *, sig,
+ size_t, sigsz)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ int submitted = 0;
+ struct fd f;
+
+ if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+ return -EINVAL;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EOPNOTSUPP;
+ if (f.file->f_op != &io_uring_fops)
+ goto out_fput;
+
+ ret = -ENXIO;
+ ctx = f.file->private_data;
+ if (!percpu_ref_tryget(&ctx->refs))
+ goto out_fput;
+
+ /*
+ * For SQ polling, the thread will do all submissions and completions.
+ * Just return the requested submit count, and wake the thread if
+ * we were asked to.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (flags & IORING_ENTER_SQ_WAKEUP)
+ wake_up(&ctx->sqo_wait);
+ submitted = to_submit;
+ goto out_ctx;
+ }
+
+ ret = 0;
+ if (to_submit) {
+ to_submit = min(to_submit, ctx->sq_entries);
+
+ mutex_lock(&ctx->uring_lock);
+ submitted = io_ring_submit(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (submitted < 0)
+ goto out_ctx;
+ }
+ if (flags & IORING_ENTER_GETEVENTS) {
+ unsigned nr_events = 0;
+
+ min_complete = min(min_complete, ctx->cq_entries);
+
+ /*
+ * The application could have included the 'to_submit' count
+ * in how many events it wanted to wait for. If we failed to
+ * submit the desired count, we may need to adjust the number
+ * of events to poll/wait for.
+ */
+ if (submitted < to_submit)
+ min_complete = min_t(unsigned, submitted, min_complete);
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ mutex_lock(&ctx->uring_lock);
+ ret = io_iopoll_check(ctx, &nr_events, min_complete);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ }
+ }
+
+out_ctx:
+ io_ring_drop_ctx_refs(ctx, 1);
+out_fput:
+ fdput(f);
+ return submitted ? submitted : ret;
+}
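+
+/*
+ * Sketch of typical calls into this syscall (illustrative, not part of the
+ * UAPI headers): submit only, wait only, or both in one call:
+ *
+ *	io_uring_enter(fd, 8, 0, 0, NULL, 0);
+ *	io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL, 0);
+ *	io_uring_enter(fd, 8, 8, IORING_ENTER_GETEVENTS, NULL, 0);
+ *
+ * The last form submits up to 8 sqes and then waits until at least 8
+ * completions are available.
+ */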
+
+static const struct file_operations io_uring_fops = {
+ .release = io_uring_release,
+ .mmap = io_uring_mmap,
+ .poll = io_uring_poll,
+ .fasync = io_uring_fasync,
+};
+
+static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ struct io_sq_ring *sq_ring;
+ struct io_cq_ring *cq_ring;
+ size_t size;
+
+ sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
+ if (!sq_ring)
+ return -ENOMEM;
+
+ ctx->sq_ring = sq_ring;
+ sq_ring->ring_mask = p->sq_entries - 1;
+ sq_ring->ring_entries = p->sq_entries;
+ ctx->sq_mask = sq_ring->ring_mask;
+ ctx->sq_entries = sq_ring->ring_entries;
+
+ size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ ctx->sq_sqes = io_mem_alloc(size);
+ if (!ctx->sq_sqes) {
+ io_mem_free(ctx->sq_ring);
+ return -ENOMEM;
+ }
+
+ cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
+ if (!cq_ring) {
+ io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->sq_sqes);
+ return -ENOMEM;
+ }
+
+ ctx->cq_ring = cq_ring;
+ cq_ring->ring_mask = p->cq_entries - 1;
+ cq_ring->ring_entries = p->cq_entries;
+ ctx->cq_mask = cq_ring->ring_mask;
+ ctx->cq_entries = cq_ring->ring_entries;
+ return 0;
+}
+
+/*
+ * Allocate an anonymous fd; this is what constitutes the application
+ * visible backing of an io_uring instance. The application mmaps this
+ * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
+ * we have to tie this fd to a socket for file garbage collection purposes.
+ */
+static int io_uring_get_fd(struct io_ring_ctx *ctx)
+{
+ struct file *file;
+ int ret;
+
+#if defined(CONFIG_UNIX)
+ ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
+ &ctx->ring_sock);
+ if (ret)
+ return ret;
+#endif
+
+ ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (ret < 0)
+ goto err;
+
+ file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(file)) {
+ put_unused_fd(ret);
+ ret = PTR_ERR(file);
+ goto err;
+ }
+
+#if defined(CONFIG_UNIX)
+ ctx->ring_sock->file = file;
+ ctx->ring_sock->sk->sk_user_data = ctx;
+#endif
+ fd_install(ret, file);
+ return ret;
+err:
+#if defined(CONFIG_UNIX)
+ sock_release(ctx->ring_sock);
+ ctx->ring_sock = NULL;
+#endif
+ return ret;
+}
+
+static int io_uring_create(unsigned entries, struct io_uring_params *p)
+{
+ struct user_struct *user = NULL;
+ struct io_ring_ctx *ctx;
+ bool account_mem;
+ int ret;
+
+ if (!entries || entries > IORING_MAX_ENTRIES)
+ return -EINVAL;
+
+ /*
+ * Use twice as many entries for the CQ ring. It's possible for the
+ * application to drive a higher depth than the size of the SQ ring,
+ * since the sqes are only used at submission time. This allows for
+ * some flexibility in overcommitting.
+ */
+ p->sq_entries = roundup_pow_of_two(entries);
+ p->cq_entries = 2 * p->sq_entries;
+
+ user = get_uid(current_user());
+ account_mem = !capable(CAP_IPC_LOCK);
+
+ if (account_mem) {
+ ret = io_account_mem(user,
+ ring_pages(p->sq_entries, p->cq_entries));
+ if (ret) {
+ free_uid(user);
+ return ret;
+ }
+ }
+
+ ctx = io_ring_ctx_alloc(p);
+ if (!ctx) {
+ if (account_mem)
+ io_unaccount_mem(user, ring_pages(p->sq_entries,
+ p->cq_entries));
+ free_uid(user);
+ return -ENOMEM;
+ }
+ ctx->compat = in_compat_syscall();
+ ctx->account_mem = account_mem;
+ ctx->user = user;
+
+ ret = io_allocate_scq_urings(ctx, p);
+ if (ret)
+ goto err;
+
+ ret = io_sq_offload_start(ctx, p);
+ if (ret)
+ goto err;
+
+ ret = io_uring_get_fd(ctx);
+ if (ret < 0)
+ goto err;
+
+ memset(&p->sq_off, 0, sizeof(p->sq_off));
+ p->sq_off.head = offsetof(struct io_sq_ring, r.head);
+ p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
+ p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
+ p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
+ p->sq_off.flags = offsetof(struct io_sq_ring, flags);
+ p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
+ p->sq_off.array = offsetof(struct io_sq_ring, array);
+
+ memset(&p->cq_off, 0, sizeof(p->cq_off));
+ p->cq_off.head = offsetof(struct io_cq_ring, r.head);
+ p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
+ p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
+ p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
+ p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
+ p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+ return ret;
+err:
+ io_ring_ctx_wait_and_kill(ctx);
+ return ret;
+}
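+
+/*
+ * Example of the sizing above: a request for entries == 100 yields
+ * sq_entries == 128 (rounded up to the next power of two) and
+ * cq_entries == 256.
+ */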
+
+/*
+ * Sets up an io_uring context and returns the fd. The application asks for a
+ * ring size; we return the actual sq/cq ring sizes (among other things) in the
+ * params structure passed in.
+ */
+static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
+{
+ struct io_uring_params p;
+ long ret;
+ int i;
+
+ if (copy_from_user(&p, params, sizeof(p)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
+ if (p.resv[i])
+ return -EINVAL;
+ }
+
+ if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+ IORING_SETUP_SQ_AFF))
+ return -EINVAL;
+
+ ret = io_uring_create(entries, &p);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(params, &p, sizeof(p)))
+ return -EFAULT;
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(io_uring_setup, u32, entries,
+ struct io_uring_params __user *, params)
+{
+ return io_uring_setup(entries, params);
+}
+
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+ void __user *arg, unsigned nr_args)
+{
+ int ret;
+
+ percpu_ref_kill(&ctx->refs);
+ wait_for_completion(&ctx->ctx_done);
+
+ switch (opcode) {
+ case IORING_REGISTER_BUFFERS:
+ ret = io_sqe_buffer_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_BUFFERS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_buffer_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES:
+ ret = io_sqe_files_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_FILES:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_files_unregister(ctx);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ /* bring the ctx back to life */
+ reinit_completion(&ctx->ctx_done);
+ percpu_ref_reinit(&ctx->refs);
+ return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+ void __user *, arg, unsigned int, nr_args)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ struct fd f;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EOPNOTSUPP;
+ if (f.file->f_op != &io_uring_fops)
+ goto out_fput;
+
+ ctx = f.file->private_data;
+
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_uring_register(ctx, opcode, arg, nr_args);
+ mutex_unlock(&ctx->uring_lock);
+out_fput:
+ fdput(f);
+ return ret;
+}
+
+static int __init io_uring_init(void)
+{
+ req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ return 0;
+};
+__initcall(io_uring_init);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2005529af560..fef3a6bf7c78 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -203,7 +203,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
fieinfo.fi_extents_start = ufiemap->fm_extents;
if (fiemap.fm_extent_count != 0 &&
- !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+ !access_ok(fieinfo.fi_extents_start,
fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
return -EFAULT;
@@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
struct fd src_file = fdget(srcfd);
+ loff_t cloned;
int ret;
if (!src_file.file)
@@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EXDEV;
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
- ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+ cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+ olen, 0);
+ if (cloned < 0)
+ ret = cloned;
+ else if (olen && cloned != olen)
+ ret = -EINVAL;
+ else
+ ret = 0;
fdput:
fdput(src_file);
return ret;
@@ -669,6 +677,9 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
return ioctl_fiemap(filp, arg);
case FIGETBSZ:
+ /* anon_bdev filesystems may not have a block size */
+ if (!inode->i_sb->s_blocksize)
+ return -EINVAL;
return put_user(inode->i_sb->s_blocksize, argp);
case FICLONE:
diff --git a/fs/iomap.c b/fs/iomap.c
index ec15cf2ec696..97cb9d486a7d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -30,7 +30,6 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>
-#include <linux/swap.h>
#include "internal.h"
@@ -117,6 +116,12 @@ iomap_page_create(struct inode *inode, struct page *page)
atomic_set(&iop->read_count, 0);
atomic_set(&iop->write_count, 0);
bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+
+ /*
+ * migrate_page_move_mapping() assumes that pages with private data have
+ * their count elevated by 1.
+ */
+ get_page(page);
set_page_private(page, (unsigned long)iop);
SetPagePrivate(page);
return iop;
@@ -133,6 +138,7 @@ iomap_page_release(struct page *page)
WARN_ON_ONCE(atomic_read(&iop->write_count));
ClearPagePrivate(page);
set_page_private(page, 0);
+ put_page(page);
kfree(iop);
}
@@ -143,13 +149,14 @@ static void
iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
{
+ loff_t orig_pos = *pos;
+ loff_t isize = i_size_read(inode);
unsigned block_bits = inode->i_blkbits;
unsigned block_size = (1 << block_bits);
unsigned poff = offset_in_page(*pos);
unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
unsigned first = poff >> block_bits;
unsigned last = (poff + plen - 1) >> block_bits;
- unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
/*
* If the block size is smaller than the page size we need to check the
@@ -184,8 +191,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
* handle both halves separately so that we properly zero data in the
* page cache for blocks that are entirely outside of i_size.
*/
- if (first <= end && last > end)
- plen -= (last - end) * block_size;
+ if (orig_pos <= isize && orig_pos + length > isize) {
+ unsigned end = offset_in_page(isize - 1) >> block_bits;
+
+ if (first <= end && last > end)
+ plen -= (last - end) * block_size;
+ }
*offp = poff;
*lenp = plen;
@@ -263,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
iomap_read_page_end_io(bvec, error);
bio_put(bio);
}
@@ -313,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
*/
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
- if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -344,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
- __bio_add_page(ctx->bio, page, plen, poff);
+ bio_add_page(ctx->bio, page, plen, poff);
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -488,16 +500,29 @@ done:
}
EXPORT_SYMBOL_GPL(iomap_readpages);
+/*
+ * iomap_is_partially_uptodate checks whether blocks within a page are
+ * uptodate or not.
+ *
+ * Returns true if all blocks which correspond to a file portion
+ * we want to read within the page are uptodate.
+ */
int
iomap_is_partially_uptodate(struct page *page, unsigned long from,
unsigned long count)
{
struct iomap_page *iop = to_iomap_page(page);
struct inode *inode = page->mapping->host;
- unsigned first = from >> inode->i_blkbits;
- unsigned last = (from + count - 1) >> inode->i_blkbits;
+ unsigned len, first, last;
unsigned i;
+ /* Limit range to one page */
+ len = min_t(unsigned, PAGE_SIZE - from, count);
+
+ /* First and last blocks in range within page */
+ first = from >> inode->i_blkbits;
+ last = (from + len - 1) >> inode->i_blkbits;
+
if (iop) {
for (i = first; i <= last; i++)
if (!test_bit(i, iop->uptodate))
@@ -546,14 +571,16 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
{
int ret;
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
if (page_has_private(page)) {
ClearPagePrivate(page);
+ get_page(newpage);
set_page_private(newpage, page_private(page));
set_page_private(page, 0);
+ put_page(page);
SetPagePrivate(newpage);
}
@@ -1057,7 +1084,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
return length;
}
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -1437,6 +1464,28 @@ struct iomap_dio {
};
};
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+ struct request_queue *q = READ_ONCE(kiocb->private);
+
+ if (!q)
+ return 0;
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
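+
+/*
+ * kiocb->private and kiocb->ki_cookie are filled in at submission time by
+ * the WRITE_ONCE() pair in iomap_dio_rw() below, so a caller polling for
+ * this kiocb's completion spins on the queue the bio actually went to.
+ */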
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+ struct bio *bio)
+{
+ atomic_inc(&dio->ref);
+
+ if (dio->iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, dio->iocb);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+}
+
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
@@ -1526,7 +1575,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
if (dio->wait_for_completion) {
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
@@ -1542,18 +1591,20 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
}
-static blk_qc_t
+static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
unsigned len)
{
struct page *page = ZERO_PAGE(0);
+ int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bio = bio_alloc(GFP_KERNEL, 1);
@@ -1564,10 +1615,8 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
get_page(page);
__bio_add_page(bio, page, len, 0);
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
-
- atomic_inc(&dio->ref);
- return submit_bio(bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
+ iomap_dio_submit_bio(dio, iomap, bio);
}
static loff_t
@@ -1581,7 +1630,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
- int nr_pages, ret;
+ int nr_pages, ret = 0;
size_t copied = 0;
if ((pos | length | align) & ((1 << blkbits) - 1))
@@ -1597,12 +1646,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
- } else {
+ } else if (iomap->type == IOMAP_MAPPED) {
/*
- * Use a FUA write if we need datasync semantics, this
- * is a pure data IO that doesn't require any metadata
- * updates and the underlying device supports FUA. This
- * allows us to avoid cache flushes on IO completion.
+ * Use a FUA write if we need datasync semantics, this is a pure
+ * data IO that doesn't require any metadata updates (including
+ * after IO completion such as unwritten extent conversion) and
+ * the underlying device supports FUA. This allows us to avoid
+ * cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
(dio->flags & IOMAP_DIO_WRITE_FUA) &&
@@ -1645,8 +1695,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
+ /*
+ * We have to stop part way through an IO. We must fall
+ * through to the sub-block tail zeroing here, otherwise
+ * this short IO may expose stale data in the tail of
+ * the block we haven't written data to.
+ */
bio_put(bio);
- return copied ? copied : ret;
+ goto zero_tail;
}
n = bio->bi_iter.bi_size;
@@ -1670,20 +1726,24 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
copied += n;
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
- atomic_inc(&dio->ref);
-
- dio->submit.last_queue = bdev_get_queue(iomap->bdev);
- dio->submit.cookie = submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
- if (need_zeroout) {
+ /*
+ * We need to zeroout the tail of a sub-block write if the extent type
+ * requires zeroing or the write extends beyond EOF. If we don't zero
+ * the block tail in the latter case, we can expose stale data via mmap
+ * reads of the EOF block.
+ */
+zero_tail:
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
}
- return copied;
+ return copied ? copied : ret;
}
static loff_t
@@ -1765,6 +1825,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos, start = pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
+ bool wait_for_completion = is_sync_kiocb(iocb);
struct blk_plug plug;
struct iomap_dio *dio;
@@ -1784,7 +1845,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->end_io = end_io;
dio->error = 0;
dio->flags = 0;
- dio->wait_for_completion = is_sync_kiocb(iocb);
dio->submit.iter = iter;
dio->submit.waiter = current;
@@ -1795,7 +1855,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (pos >= dio->i_size)
goto out_free_dio;
- if (iter->type == ITER_IOVEC)
+ if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
dio->flags |= IOMAP_DIO_DIRTY;
} else {
flags |= IOMAP_WRITE;
@@ -1839,7 +1899,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
- if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
+ if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
!inode->i_sb->s_dio_done_wq) {
ret = sb_init_dio_done_wq(inode->i_sb);
if (ret < 0)
@@ -1855,7 +1915,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret <= 0) {
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
- dio->wait_for_completion = true;
+ wait_for_completion = true;
ret = 0;
}
break;
@@ -1877,8 +1937,27 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+ WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
+ /*
+ * We are about to drop our additional submission reference, which
+ * might be the last reference to the dio. There are three
+ * different ways we can progress here:
+ *
+ * (a) If this is the last reference we will always complete and free
+ * the dio ourselves.
+ * (b) If this is not the last reference, and we serve an asynchronous
+ * iocb, we must never touch the dio after the decrement, the
+ * I/O completion handler will complete and free it.
+ * (c) If this is not the last reference, but we serve a synchronous
+ * iocb, the I/O completion handler will wake us up on the drop
+ * of the final reference, and we will complete and free it here
+ * after we got woken by the I/O completion handler.
+ */
+ dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
- if (!dio->wait_for_completion)
+ if (!wait_for_completion)
return -EIOCBQUEUED;
for (;;) {
@@ -1889,15 +1968,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!dio->submit.last_queue ||
!blk_poll(dio->submit.last_queue,
- dio->submit.cookie))
+ dio->submit.cookie, true))
io_schedule();
}
__set_current_state(TASK_RUNNING);
}
- ret = iomap_dio_complete(dio);
-
- return ret;
+ return iomap_dio_complete(dio);
out_free_dio:
kfree(dio);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 947ce22f5b3c..f0fe641893a5 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
return i;
}
-/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
+/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */
int get_acorn_filename(struct iso_directory_record *de,
char *retname, struct inode *inode)
{
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 150cc030b4d7..2eb55c3361a8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -439,6 +439,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
finish_wait(&journal->j_wait_updates, &wait);
}
spin_unlock(&commit_transaction->t_handle_lock);
+ commit_transaction->t_state = T_SWITCH;
+ write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -505,6 +507,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
atomic_sub(atomic_read(&journal->j_reserved_credits),
&commit_transaction->t_outstanding_credits);
+ write_lock(&journal->j_state_lock);
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c0b66a7a795b..cc35537232f2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -138,9 +138,9 @@ static inline void update_t_max_wait(transaction_t *transaction,
}
/*
- * Wait until running transaction passes T_LOCKED state. Also starts the commit
- * if needed. The function expects running transaction to exist and releases
- * j_state_lock.
+ * Wait until running transaction passes to T_FLUSH state and new transaction
+ * can thus be started. Also starts the commit if needed. The function expects
+ * running transaction to exist and releases j_state_lock.
*/
static void wait_transaction_locked(journal_t *journal)
__releases(journal->j_state_lock)
@@ -160,6 +160,32 @@ static void wait_transaction_locked(journal_t *journal)
finish_wait(&journal->j_wait_transaction_locked, &wait);
}
+/*
+ * Wait until running transaction transitions from T_SWITCH to T_FLUSH
+ * state and new transaction can thus be started. The function releases
+ * j_state_lock.
+ */
+static void wait_transaction_switching(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (WARN_ON(!journal->j_running_transaction ||
+ journal->j_running_transaction->t_state != T_SWITCH))
+ return;
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ read_unlock(&journal->j_state_lock);
+ /*
+ * We don't call jbd2_might_wait_for_commit() here as there's no
+ * waiting for outstanding handles happening anymore in T_SWITCH state
+ * and handling of reserved handles actually relies on that for
+ * correctness.
+ */
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
static void sub_reserved_credits(journal_t *journal, int blocks)
{
atomic_sub(blocks, &journal->j_reserved_credits);
@@ -183,7 +209,8 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* If the current transaction is locked down for commit, wait
* for the lock to be released.
*/
- if (t->t_state == T_LOCKED) {
+ if (t->t_state != T_RUNNING) {
+ WARN_ON_ONCE(t->t_state >= T_FLUSH);
wait_transaction_locked(journal);
return 1;
}
@@ -360,8 +387,14 @@ repeat:
/*
* We have handle reserved so we are allowed to join T_LOCKED
* transaction and we don't have to check for transaction size
- * and journal space.
+ * and journal space. But we still have to wait while running
+ * transaction is being switched to a committing one as it
+ * won't wait for any handles anymore.
*/
+ if (transaction->t_state == T_SWITCH) {
+ wait_transaction_switching(journal);
+ goto repeat;
+ }
sub_reserved_credits(journal, blocks);
handle->h_reserved = 0;
}
@@ -910,7 +943,7 @@ repeat:
* this is the first time this transaction is touching this buffer,
* reset the modified flag
*/
- jh->b_modified = 0;
+ jh->b_modified = 0;
/*
* If the buffer is not journaled right now, we need to make sure it
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 902a7dd10e5c..bb6ae387469f 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
- cancel_delayed_work_sync(&c->wbuf_dwork);
+ if (jffs2_is_writebuffered(c))
+ cancel_delayed_work_sync(&c->wbuf_dwork);
#endif
mutex_lock(&c->alloc_sem);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 4ca0b5c18192..b84d635567d3 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -536,8 +536,8 @@ void kernfs_put(struct kernfs_node *kn)
security_release_secctx(kn->iattr->ia_secdata,
kn->iattr->ia_secdata_len);
simple_xattrs_free(&kn->iattr->xattrs);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
- kfree(kn->iattr);
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, kn->id.ino);
spin_unlock(&kernfs_idr_lock);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index dbf5bc250bfd..ae948aaa4c53 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -832,32 +832,40 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+ struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+ struct kernfs_open_node *on = kn->attr.open;
+
+ poll_wait(of->file, &on->poll, wait);
+
+ if (of->event != atomic_read(&on->event))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ __poll_t ret;
if (!kernfs_get_active(kn))
- goto trigger;
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait);
+ if (kn->attr.ops->poll)
+ ret = kn->attr.ops->poll(of, wait);
+ else
+ ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
-
- if (of->event != atomic_read(&on->event))
- goto trigger;
-
- return DEFAULT_POLLMASK;
-
- trigger:
- return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+ return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
- struct kernfs_open_node *on;
struct kernfs_super_info *info;
repeat:
/* pop one off the notify_list */
@@ -871,17 +879,6 @@ repeat:
kn->attr.notify_next = NULL;
spin_unlock_irq(&kernfs_notify_lock);
- /* kick poll */
- spin_lock_irq(&kernfs_open_node_lock);
-
- on = kn->attr.open;
- if (on) {
- atomic_inc(&on->event);
- wake_up_interruptible(&on->poll);
- }
-
- spin_unlock_irq(&kernfs_open_node_lock);
-
/* kick fsnotify */
mutex_lock(&kernfs_mutex);
@@ -934,10 +931,21 @@ void kernfs_notify(struct kernfs_node *kn)
{
static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
unsigned long flags;
+ struct kernfs_open_node *on;
if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
return;
+ /* kick poll immediately */
+ spin_lock_irqsave(&kernfs_open_node_lock, flags);
+ on = kn->attr.open;
+ if (on) {
+ atomic_inc(&on->event);
+ wake_up_interruptible(&on->poll);
+ }
+ spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+
+ /* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
if (!kn->attr.notify_next) {
kernfs_get(kn);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 80cebcd94c90..0c1fd945ce42 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -42,7 +42,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
if (kn->iattr)
goto out_unlock;
- kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+ kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
if (!kn->iattr)
goto out_unlock;
iattrs = &kn->iattr->ia_iattr;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3d83b114bb08..dba810cd83b1 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -78,7 +78,7 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
}
extern const struct super_operations kernfs_sops;
-extern struct kmem_cache *kernfs_node_cache;
+extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index ff2716f9322e..f3ac352699cf 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,7 +20,7 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache;
+struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
{
@@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
return dentry;
knparent = find_next_ancestor(kn, NULL);
- if (WARN_ON(!knparent))
+ if (WARN_ON(!knparent)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
do {
struct dentry *dtmp;
@@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
if (kn == knparent)
return dentry;
kntmp = find_next_ancestor(kn, knparent);
- if (WARN_ON(!kntmp))
+ if (WARN_ON(!kntmp)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
strlen(kntmp->name));
dput(dentry);
@@ -236,6 +240,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
sb->s_export_op = &kernfs_export_ops;
sb->s_time_gran = 1;
+ /* sysfs dentries and inodes don't require IO to create */
+ sb->s_shrink.seeks = 0;
+
/* get root inode, initialize and unlock it */
mutex_lock(&kernfs_mutex);
inode = kernfs_get_inode(sb, info->root->kn);
@@ -414,4 +421,9 @@ void __init kernfs_init(void)
0,
SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
NULL);
+
+ /* Create a slab cache for kernfs inode attributes */
+ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
+ sizeof(struct kernfs_iattrs),
+ 0, SLAB_PANIC, NULL);
}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 305b220af45d..162f43b80c84 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -72,6 +72,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
if (base == kn)
break;
+ if ((s - path) + 3 >= PATH_MAX)
+ return -ENAMETOOLONG;
+
strcpy(s, "../");
s += 3;
base = base->parent;
@@ -88,7 +91,7 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
if (len < 2)
return -EINVAL;
len--;
- if ((s - path) + len > PATH_MAX)
+ if ((s - path) + len >= PATH_MAX)
return -ENAMETOOLONG;
/* reverse fillup of target string from target to base */
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 00d5ef5f99f7..214a2fa1f1e3 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -128,24 +128,14 @@ static void encode_netobj(struct xdr_stream *xdr,
static int decode_netobj(struct xdr_stream *xdr,
struct xdr_netobj *obj)
{
- u32 length;
- __be32 *p;
+ ssize_t ret;
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
- length = be32_to_cpup(p++);
- if (unlikely(length > XDR_MAX_NETOBJ))
- goto out_size;
- obj->len = length;
- obj->data = (u8 *)p;
+ ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data,
+ XDR_MAX_NETOBJ);
+ if (unlikely(ret < 0))
+ return -EIO;
+ obj->len = ret;
return 0;
-out_size:
- dprintk("NFS: returned netobj was too long: %u\n", length);
- return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index d20b92f271c2..e8a004097d18 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -256,7 +256,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue)
* Generic NLM call
*/
static int
-nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc)
+nlmclnt_call(const struct cred *cred, struct nlm_rqst *req, u32 proc)
{
struct nlm_host *host = req->a_host;
struct rpc_clnt *clnt;
@@ -401,7 +401,7 @@ int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *t
* completion in order to be able to correctly track the lock
* state.
*/
-static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+static int nlmclnt_async_call(const struct cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
{
struct rpc_message msg = {
.rpc_argp = &req->a_args,
@@ -442,7 +442,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
fl->fl_start = req->a_res.lock.fl.fl_start;
fl->fl_end = req->a_res.lock.fl.fl_end;
fl->fl_type = req->a_res.lock.fl.fl_type;
- fl->fl_pid = 0;
+ fl->fl_pid = -req->a_res.lock.fl.fl_pid;
break;
default:
status = nlm_stat_to_errno(req->a_res.status);
@@ -510,7 +510,7 @@ static int do_vfs_lock(struct file_lock *fl)
static int
nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
{
- struct rpc_cred *cred = nfs_file_cred(fl->fl_file);
+ const struct cred *cred = nfs_file_cred(fl->fl_file);
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
struct nlm_wait *block = NULL;
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 2c6176387143..747b9c8c940a 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -125,24 +125,14 @@ static void encode_netobj(struct xdr_stream *xdr,
static int decode_netobj(struct xdr_stream *xdr,
struct xdr_netobj *obj)
{
- u32 length;
- __be32 *p;
+ ssize_t ret;
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
- length = be32_to_cpup(p++);
- if (unlikely(length > XDR_MAX_NETOBJ))
- goto out_size;
- obj->len = length;
- obj->data = (u8 *)p;
+ ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data,
+ XDR_MAX_NETOBJ);
+ if (unlikely(ret < 0))
+ return -EIO;
+ obj->len = ret;
return 0;
-out_size:
- dprintk("NFS: returned netobj was too long: %u\n", length);
- return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index d35cd6be0675..93fb7cf0b92b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -341,7 +341,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
};
struct lockd_net *ln = net_generic(net, lockd_net_id);
- dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+ dprintk("lockd: %s(host='%.*s', vers=%u, proto=%s)\n", __func__,
(int)hostname_len, hostname, rqstp->rq_vers,
(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 74330daeab71..ea719cdd6a36 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
dprintk("lockd: unlinking block %p...\n", block);
/* Remove block from list */
- status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
+ status = locks_delete_block(&block->b_call->a_args.lock.fl);
nlmsvc_remove_block(block);
return status;
}
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 7147e4aebecc..9846f7e95282 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -127,7 +127,7 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
locks_init_lock(fl);
fl->fl_owner = current->files;
- fl->fl_pid = (pid_t)lock->svid;
+ fl->fl_pid = current->tgid;
fl->fl_flags = FL_POSIX;
fl->fl_type = F_RDLCK; /* as good as anything else */
start = ntohl(*p++);
@@ -269,7 +269,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
memset(lock, 0, sizeof(*lock));
locks_init_lock(&lock->fl);
lock->svid = ~(u32) 0;
- lock->fl.fl_pid = (pid_t)lock->svid;
+ lock->fl.fl_pid = current->tgid;
if (!(p = nlm_decode_cookie(p, &argp->cookie))
|| !(p = xdr_decode_string_inplace(p, &lock->caller,
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 7ed9edf9aed4..70154f376695 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -119,7 +119,7 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
locks_init_lock(fl);
fl->fl_owner = current->files;
- fl->fl_pid = (pid_t)lock->svid;
+ fl->fl_pid = current->tgid;
fl->fl_flags = FL_POSIX;
fl->fl_type = F_RDLCK; /* as good as anything else */
p = xdr_decode_hyper(p, &start);
@@ -266,7 +266,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
memset(lock, 0, sizeof(*lock));
locks_init_lock(&lock->fl);
lock->svid = ~(u32) 0;
- lock->fl.fl_pid = (pid_t)lock->svid;
+ lock->fl.fl_pid = current->tgid;
if (!(p = nlm4_decode_cookie(p, &argp->cookie))
|| !(p = xdr_decode_string_inplace(p, &lock->caller,
diff --git a/fs/locks.c b/fs/locks.c
index 2ecb4db8c840..eaa1cfaf73b0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -11,11 +11,11 @@
*
* Miscellaneous edits, and a total rewrite of posix_lock_file() code.
* Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
+ *
* Converted file_lock_table to a linked list from an array, which eliminates
* the limits on how many active file locks are open.
* Chad Page (pageone@netcom.com), November 27, 1994
- *
+ *
* Removed dependency on file descriptors. dup()'ed file descriptors now
* get the same locks as the original file descriptors, and a close() on
* any file descriptor removes ALL the locks on the file for the current
@@ -41,7 +41,7 @@
* with a file pointer (filp). As a result they can be shared by a parent
* process and its children after a fork(). They are removed when the last
* file descriptor referring to the file pointer is closed (unless explicitly
- * unlocked).
+ * unlocked).
*
* FL_FLOCK locks never deadlock, an existing lock is always removed before
* upgrading from shared to exclusive (or vice versa). When this happens
@@ -50,7 +50,7 @@
* Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
*
* Removed some race conditions in flock_lock_file(), marked other possible
- * races. Just grep for FIXME to see them.
+ * races. Just grep for FIXME to see them.
* Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
*
* Addressed Dmitry's concerns. Deadlock checking no longer recursive.
@@ -112,6 +112,46 @@
* Leases and LOCK_MAND
* Matthew Wilcox <willy@debian.org>, June, 2000.
* Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
+ *
+ * Locking conflicts and dependencies:
+ * If multiple threads attempt to lock the same byte (or flock the same file)
+ * only one can be granted the lock, and the others must wait their turn.
+ * The first lock has been "applied" or "granted", the others are "waiting"
+ * and are "blocked" by the "applied" lock.
+ *
+ * Waiting and applied locks are all kept in trees whose properties are:
+ *
+ * - the root of a tree may be an applied or waiting lock.
+ * - every other node in the tree is a waiting lock that
+ * conflicts with every ancestor of that node.
+ *
+ * Every such tree begins life as a waiting singleton which obviously
+ * satisfies the above properties.
+ *
+ * We only modify trees in ways that preserve these properties:
+ *
+ * 1. We may add a new leaf node, but only after first verifying that it
+ * conflicts with all of its ancestors.
+ * 2. We may remove the root of a tree, creating a new singleton
+ * tree from the root and N new trees rooted in the immediate
+ * children.
+ * 3. If the root of a tree is not currently an applied lock, we may
+ * apply it (if possible).
+ * 4. We may upgrade the root of the tree (either extend its range,
+ * or upgrade its entire range from read to write).
+ *
+ * When an applied lock is modified in a way that reduces or downgrades any
+ * part of its range, we remove all its children (2 above). This happens,
+ * in particular, when a lock is unlocked.
+ *
+ * For each of those child trees we "wake up" the thread which is
+ * waiting for the lock so it can continue handling as follows: if the
+ * root of the tree applies, we do so (3). If it doesn't, it must
+ * conflict with some applied lock. We remove (wake up) all of its children
+ * (2), and add it as a new leaf to the tree rooted in the applied
+ * lock (1). We then repeat the process recursively with those
+ * children.
+ *
*/
#include <linux/capability.h>
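As a rough illustration of rule 1 above, here is a sketch of the descent that the new __locks_insert_block() performs later in this patch (names mirror the patch; illustrative only, not kernel code):

/* Descend to the deepest existing waiter that conflicts with the new
 * request, so the request blocks on -- and is later woken by -- a lock
 * it actually conflicts with, rather than the whole queue. */
static void tree_insert_sketch(struct file_lock *blocker,
			       struct file_lock *waiter,
			       bool conflict(struct file_lock *,
					     struct file_lock *))
{
	struct file_lock *fl;

restart:
	list_for_each_entry(fl, &blocker->fl_blocked_requests,
			    fl_blocked_member)
		if (conflict(fl, waiter)) {
			blocker = fl;	/* waiter belongs beneath this waiter */
			goto restart;
		}
	waiter->fl_blocker = blocker;
	list_add_tail(&waiter->fl_blocked_member,
		      &blocker->fl_blocked_requests);
}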
@@ -189,9 +229,9 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
* This lock protects the blocked_hash. Generally, if you're accessing it, you
* want to be holding this lock.
*
- * In addition, it also protects the fl->fl_block list, and the fl->fl_next
- * pointer for file_lock structures that are acting as lock requests (in
- * contrast to those that are acting as records of acquired locks).
+ * In addition, it also protects the fl->fl_blocked_requests list, and the
+ * fl->fl_blocker pointer for file_lock structures that are acting as lock
+ * requests (in contrast to those that are acting as records of acquired locks).
*
* Note that when we acquire this lock in order to change the above fields,
* we often hold the flc_lock as well. In certain cases, when reading the fields
@@ -293,7 +333,8 @@ static void locks_init_lock_heads(struct file_lock *fl)
{
INIT_HLIST_NODE(&fl->fl_link);
INIT_LIST_HEAD(&fl->fl_list);
- INIT_LIST_HEAD(&fl->fl_block);
+ INIT_LIST_HEAD(&fl->fl_blocked_requests);
+ INIT_LIST_HEAD(&fl->fl_blocked_member);
init_waitqueue_head(&fl->fl_wait);
}
@@ -332,7 +373,8 @@ void locks_free_lock(struct file_lock *fl)
{
BUG_ON(waitqueue_active(&fl->fl_wait));
BUG_ON(!list_empty(&fl->fl_list));
- BUG_ON(!list_empty(&fl->fl_block));
+ BUG_ON(!list_empty(&fl->fl_blocked_requests));
+ BUG_ON(!list_empty(&fl->fl_blocked_member));
BUG_ON(!hlist_unhashed(&fl->fl_link));
locks_release_private(fl);
@@ -357,7 +399,6 @@ void locks_init_lock(struct file_lock *fl)
memset(fl, 0, sizeof(struct file_lock));
locks_init_lock_heads(fl);
}
-
EXPORT_SYMBOL(locks_init_lock);
/*
@@ -397,9 +438,26 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
fl->fl_ops->fl_copy_lock(new, fl);
}
}
-
EXPORT_SYMBOL(locks_copy_lock);
+static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
+{
+ struct file_lock *f;
+
+ /*
+ * As ctx->flc_lock is held, new requests cannot be added to
+ * ->fl_blocked_requests, so we don't need a lock to check if it
+ * is empty.
+ */
+ if (list_empty(&fl->fl_blocked_requests))
+ return;
+ spin_lock(&blocked_lock_lock);
+ list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
+ list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
+ f->fl_blocker = new;
+ spin_unlock(&blocked_lock_lock);
+}
+
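Granting a deferred request now pairs the copy with this re-parenting step, so waiters queued on the request follow it to the lock that actually gets inserted. The pattern, as it appears in the flock and POSIX grant paths later in this patch:

	locks_copy_lock(new_fl, request);
	locks_move_blocks(new_fl, request);	/* waiters now block on new_fl */
	locks_insert_lock_ctx(new_fl, &ctx->flc_flock);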
static inline int flock_translate_cmd(int cmd) {
if (cmd & LOCK_MAND)
return cmd & (LOCK_MAND | LOCK_RW);
@@ -416,17 +474,20 @@ static inline int flock_translate_cmd(int cmd) {
/* Fill in a file_lock structure with an appropriate FLOCK lock. */
static struct file_lock *
-flock_make_lock(struct file *filp, unsigned int cmd)
+flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
{
- struct file_lock *fl;
int type = flock_translate_cmd(cmd);
if (type < 0)
return ERR_PTR(type);
-
- fl = locks_alloc_lock();
- if (fl == NULL)
- return ERR_PTR(-ENOMEM);
+
+ if (fl == NULL) {
+ fl = locks_alloc_lock();
+ if (fl == NULL)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ locks_init_lock(fl);
+ }
fl->fl_file = filp;
fl->fl_owner = filp;
@@ -434,7 +495,7 @@ flock_make_lock(struct file *filp, unsigned int cmd)
fl->fl_flags = FL_FLOCK;
fl->fl_type = type;
fl->fl_end = OFFSET_MAX;
-
+
return fl;
}
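The new third parameter lets callers on teardown paths pass an on-stack lock and skip the allocation entirely; this is how locks_remove_flock() uses it further down:

	struct file_lock fl;

	flock_make_lock(filp, LOCK_UN, &fl);	/* fills in the stack copy */
	fl.fl_flags |= FL_CLOSE;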
@@ -666,16 +727,58 @@ static void locks_delete_global_blocked(struct file_lock *waiter)
static void __locks_delete_block(struct file_lock *waiter)
{
locks_delete_global_blocked(waiter);
- list_del_init(&waiter->fl_block);
- waiter->fl_next = NULL;
+ list_del_init(&waiter->fl_blocked_member);
+ waiter->fl_blocker = NULL;
}
-static void locks_delete_block(struct file_lock *waiter)
+static void __locks_wake_up_blocks(struct file_lock *blocker)
{
+ while (!list_empty(&blocker->fl_blocked_requests)) {
+ struct file_lock *waiter;
+
+ waiter = list_first_entry(&blocker->fl_blocked_requests,
+ struct file_lock, fl_blocked_member);
+ __locks_delete_block(waiter);
+ if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
+ waiter->fl_lmops->lm_notify(waiter);
+ else
+ wake_up(&waiter->fl_wait);
+ }
+}
+
+/**
+ * locks_delete_block - stop waiting for a file lock
+ * @waiter: the lock which was waiting
+ *
+ * lockd/nfsd need to disconnect the lock while working on it.
+ */
+int locks_delete_block(struct file_lock *waiter)
+{
+ int status = -ENOENT;
+
+ /*
+ * If fl_blocker is NULL, it won't be set again as this thread
+ * "owns" the lock and is the only one that might try to claim
+ * the lock. So it is safe to test fl_blocker locklessly.
+ * Also if fl_blocker is NULL, this waiter is not listed on
+ * fl_blocked_requests for some lock, so no other request can
+ * be added to the list of fl_blocked_requests for this
+ * request. So if fl_blocker is NULL, it is safe to
+ * locklessly check if fl_blocked_requests is empty. If both
+ * of these checks succeed, there is no need to take the lock.
+ */
+ if (waiter->fl_blocker == NULL &&
+ list_empty(&waiter->fl_blocked_requests))
+ return status;
spin_lock(&blocked_lock_lock);
+ if (waiter->fl_blocker)
+ status = 0;
+ __locks_wake_up_blocks(waiter);
__locks_delete_block(waiter);
spin_unlock(&blocked_lock_lock);
+ return status;
}
+EXPORT_SYMBOL(locks_delete_block);
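All the wait loops touched by this patch converge on one pattern: sleep until fl_blocker clears, then call locks_delete_block() unconditionally on the way out, which also rewakes anything that queued beneath the request while it waited. From posix_lock_inode_wait() below:

	for (;;) {
		error = posix_lock_inode(inode, fl, NULL);
		if (error != FILE_LOCK_DEFERRED)
			break;
		error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
		if (error)
			break;
	}
	locks_delete_block(fl);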
/* Insert waiter into blocker's block list.
* We use a circular list so that processes can be easily woken up in
@@ -683,26 +786,49 @@ static void locks_delete_block(struct file_lock *waiter)
* it seems like the reasonable thing to do.
*
* Must be called with both the flc_lock and blocked_lock_lock held. The
- * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
- * that the flc_lock is also held on insertions we can avoid taking the
- * blocked_lock_lock in some cases when we see that the fl_block list is empty.
+ * fl_blocked_requests list itself is protected by the blocked_lock_lock,
+ * but by ensuring that the flc_lock is also held on insertions we can avoid
+ * taking the blocked_lock_lock in some cases when we see that the
+ * fl_blocked_requests list is empty.
+ *
+ * Rather than just adding to the list, we check for conflicts with any existing
+ * waiters, and add beneath any waiter that blocks the new waiter.
+ * Thus wakeups don't happen until needed.
*/
static void __locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter)
+ struct file_lock *waiter,
+ bool conflict(struct file_lock *,
+ struct file_lock *))
{
- BUG_ON(!list_empty(&waiter->fl_block));
- waiter->fl_next = blocker;
- list_add_tail(&waiter->fl_block, &blocker->fl_block);
+ struct file_lock *fl;
+ BUG_ON(!list_empty(&waiter->fl_blocked_member));
+
+new_blocker:
+ list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
+ if (conflict(fl, waiter)) {
+ blocker = fl;
+ goto new_blocker;
+ }
+ waiter->fl_blocker = blocker;
+ list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
locks_insert_global_blocked(waiter);
+
+ /* The requests in waiter->fl_blocked_requests are known to conflict
+ * with waiter, but might not conflict with blocker, or the requests
+ * and locks which block it. So they all need to be woken.
+ */
+ __locks_wake_up_blocks(waiter);
}
/* Must be called with flc_lock held. */
static void locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter)
+ struct file_lock *waiter,
+ bool conflict(struct file_lock *,
+ struct file_lock *))
{
spin_lock(&blocked_lock_lock);
- __locks_insert_block(blocker, waiter);
+ __locks_insert_block(blocker, waiter, conflict);
spin_unlock(&blocked_lock_lock);
}
@@ -716,25 +842,15 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
/*
* Avoid taking global lock if list is empty. This is safe since new
* blocked requests are only added to the list under the flc_lock, and
- * the flc_lock is always held here. Note that removal from the fl_block
- * list does not require the flc_lock, so we must recheck list_empty()
- * after acquiring the blocked_lock_lock.
+ * the flc_lock is always held here. Note that removal from the
+ * fl_blocked_requests list does not require the flc_lock, so we must
+ * recheck list_empty() after acquiring the blocked_lock_lock.
*/
- if (list_empty(&blocker->fl_block))
+ if (list_empty(&blocker->fl_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
- while (!list_empty(&blocker->fl_block)) {
- struct file_lock *waiter;
-
- waiter = list_first_entry(&blocker->fl_block,
- struct file_lock, fl_block);
- __locks_delete_block(waiter);
- if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
- waiter->fl_lmops->lm_notify(waiter);
- else
- wake_up(&waiter->fl_wait);
- }
+ __locks_wake_up_blocks(blocker);
spin_unlock(&blocked_lock_lock);
}
@@ -766,47 +882,50 @@ locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
* checks for shared/exclusive status of overlapping locks.
*/
-static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+static bool locks_conflict(struct file_lock *caller_fl,
+ struct file_lock *sys_fl)
{
if (sys_fl->fl_type == F_WRLCK)
- return 1;
+ return true;
if (caller_fl->fl_type == F_WRLCK)
- return 1;
- return 0;
+ return true;
+ return false;
}
/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
* checking before calling the locks_conflict().
*/
-static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+static bool posix_locks_conflict(struct file_lock *caller_fl,
+ struct file_lock *sys_fl)
{
/* POSIX locks owned by the same process do not conflict with
* each other.
*/
if (posix_same_owner(caller_fl, sys_fl))
- return (0);
+ return false;
/* Check whether they overlap */
if (!locks_overlap(caller_fl, sys_fl))
- return 0;
+ return false;
- return (locks_conflict(caller_fl, sys_fl));
+ return locks_conflict(caller_fl, sys_fl);
}
/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
* checking before calling the locks_conflict().
*/
-static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl)
+static bool flock_locks_conflict(struct file_lock *caller_fl,
+ struct file_lock *sys_fl)
{
/* FLOCK locks referring to the same filp do not conflict with
* each other.
*/
if (caller_fl->fl_file == sys_fl->fl_file)
- return (0);
+ return false;
if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
- return 0;
+ return false;
- return (locks_conflict(caller_fl, sys_fl));
+ return locks_conflict(caller_fl, sys_fl);
}
void
@@ -877,8 +996,11 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
struct file_lock *fl;
hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
- if (posix_same_owner(fl, block_fl))
- return fl->fl_next;
+ if (posix_same_owner(fl, block_fl)) {
+ while (fl->fl_blocker)
+ fl = fl->fl_blocker;
+ return fl;
+ }
}
return NULL;
}
@@ -936,7 +1058,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
return -ENOMEM;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -965,19 +1087,20 @@ find_conflict:
if (!(request->fl_flags & FL_SLEEP))
goto out;
error = FILE_LOCK_DEFERRED;
- locks_insert_block(fl, request);
+ locks_insert_block(fl, request, flock_locks_conflict);
goto out;
}
if (request->fl_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
+ locks_move_blocks(new_fl, request);
locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
new_fl = NULL;
error = 0;
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -1015,7 +1138,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1039,12 +1162,13 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
spin_lock(&blocked_lock_lock);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
- __locks_insert_block(fl, request);
+ __locks_insert_block(fl, request,
+ posix_locks_conflict);
}
spin_unlock(&blocked_lock_lock);
goto out;
- }
- }
+ }
+ }
/* If we're just looking for a conflict, we're done. */
error = 0;
@@ -1164,6 +1288,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
goto out;
}
locks_copy_lock(new_fl, request);
+ locks_move_blocks(new_fl, request);
locks_insert_lock_ctx(new_fl, &fl->fl_list);
fl = new_fl;
new_fl = NULL;
@@ -1187,7 +1312,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1237,13 +1362,11 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
- if (!error)
- continue;
-
- locks_delete_block(fl);
- break;
+ error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ if (error)
+ break;
}
+ locks_delete_block(fl);
return error;
}
@@ -1324,7 +1447,7 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
+ error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker);
if (!error) {
/*
* If we've been sleeping someone might have
@@ -1334,13 +1457,12 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
continue;
}
- locks_delete_block(&fl);
break;
}
+ locks_delete_block(&fl);
return error;
}
-
EXPORT_SYMBOL(locks_mandatory_area);
#endif /* CONFIG_MANDATORY_FILE_LOCKING */
@@ -1462,7 +1584,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1511,16 +1633,16 @@ restart:
break_time -= jiffies;
if (break_time == 0)
break_time++;
- locks_insert_block(fl, new_fl);
+ locks_insert_block(fl, new_fl, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
- !new_fl->fl_next, break_time);
+ !new_fl->fl_blocker, break_time);
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
@@ -1537,12 +1659,11 @@ restart:
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
}
-
EXPORT_SYMBOL(__break_lease);
/**
@@ -1573,7 +1694,6 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
if (has_lease)
*time = current_time(inode);
}
-
EXPORT_SYMBOL(lease_get_mtime);
/**
@@ -1609,7 +1729,7 @@ int fcntl_getlease(struct file *filp)
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
@@ -1619,7 +1739,7 @@ int fcntl_getlease(struct file *filp)
break;
}
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
@@ -1628,8 +1748,8 @@ int fcntl_getlease(struct file *filp)
/**
* check_conflicting_open - see if the given dentry points to a file that has
- * an existing open that would conflict with the
- * desired lease.
+ * an existing open that would conflict with the
+ * desired lease.
* @dentry: dentry to check
* @arg: type of lease that we're trying to acquire
* @flags: current lock flags
@@ -1646,7 +1766,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
if (flags & FL_LAYOUT)
return 0;
- if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+ if ((arg == F_RDLCK) && inode_is_open_for_write(inode))
return -EAGAIN;
if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
@@ -1693,7 +1813,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
return -EINVAL;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1764,7 +1884,7 @@ out_setup:
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
inode_unlock(inode);
@@ -1787,7 +1907,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1800,7 +1920,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -1853,7 +1973,7 @@ EXPORT_SYMBOL(generic_setlease);
* @arg: type of lease to obtain
* @lease: file_lock to use when adding a lease
* @priv: private info for lm_setup when adding a lease (may be
- * NULL if lm_setup doesn't require it)
+ * NULL if lm_setup doesn't require it)
*
* Call this to establish a lease on the file. The "lease" argument is not
* used for F_UNLCK requests and may be NULL. For commands that set or alter
@@ -1931,13 +2051,11 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
- if (!error)
- continue;
-
- locks_delete_block(fl);
- break;
+ error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ if (error)
+ break;
}
+ locks_delete_block(fl);
return error;
}
@@ -2001,7 +2119,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
!(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
goto out_putf;
- lock = flock_make_lock(f.file, cmd);
+ lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
error = PTR_ERR(lock);
goto out_putf;
@@ -2143,7 +2261,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
error = vfs_test_lock(filp, fl);
if (error)
goto out;
-
+
flock->l_type = fl->fl_type;
if (fl->fl_type != F_UNLCK) {
error = posix_lock_to_flock(flock, fl);
@@ -2210,13 +2328,11 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
- if (!error)
- continue;
-
- locks_delete_block(fl);
- break;
+ error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ if (error)
+ break;
}
+ locks_delete_block(fl);
return error;
}
@@ -2476,6 +2592,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
if (!ctx || list_empty(&ctx->flc_posix))
return;
+ locks_init_lock(&lock);
lock.fl_type = F_UNLCK;
lock.fl_flags = FL_POSIX | FL_CLOSE;
lock.fl_start = 0;
@@ -2492,26 +2609,21 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
lock.fl_ops->fl_release_private(&lock);
trace_locks_remove_posix(inode, &lock, error);
}
-
EXPORT_SYMBOL(locks_remove_posix);
/* The i_flctx must be valid when calling into here */
static void
locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
- struct file_lock fl = {
- .fl_owner = filp,
- .fl_pid = current->tgid,
- .fl_file = filp,
- .fl_flags = FL_FLOCK | FL_CLOSE,
- .fl_type = F_UNLCK,
- .fl_end = OFFSET_MAX,
- };
+ struct file_lock fl;
struct inode *inode = locks_inode(filp);
if (list_empty(&flctx->flc_flock))
return;
+ flock_make_lock(filp, LOCK_UN, &fl);
+ fl.fl_flags |= FL_CLOSE;
+
if (filp->f_op->flock)
filp->f_op->flock(filp, F_SETLKW, &fl);
else
@@ -2531,13 +2643,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
if (list_empty(&ctx->flc_lease))
return;
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
if (filp == fl->fl_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
@@ -2570,27 +2682,6 @@ void locks_remove_file(struct file *filp)
}
/**
- * posix_unblock_lock - stop waiting for a file lock
- * @waiter: the lock which was waiting
- *
- * lockd needs to block waiting for locks.
- */
-int
-posix_unblock_lock(struct file_lock *waiter)
-{
- int status = 0;
-
- spin_lock(&blocked_lock_lock);
- if (waiter->fl_next)
- __locks_delete_block(waiter);
- else
- status = -ENOENT;
- spin_unlock(&blocked_lock_lock);
- return status;
-}
-EXPORT_SYMBOL(posix_unblock_lock);
-
-/**
* vfs_cancel_lock - file byte range unblock lock
* @filp: The file to apply the unblock to
* @fl: The lock to be unblocked
@@ -2603,7 +2694,6 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
-
EXPORT_SYMBOL_GPL(vfs_cancel_lock);
#ifdef CONFIG_PROC_FS
@@ -2707,7 +2797,7 @@ static int locks_show(struct seq_file *f, void *v)
lock_get_status(f, fl, iter->li_pos, "");
- list_for_each_entry(bfl, &fl->fl_block, fl_block)
+ list_for_each_entry(bfl, &fl->fl_blocked_requests, fl_blocked_member)
lock_get_status(f, bfl, iter->li_pos, " ->");
return 0;
@@ -2803,7 +2893,6 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
-
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
@@ -2813,5 +2902,4 @@ static int __init filelock_init(void)
return 0;
}
-
core_initcall(filelock_init);
diff --git a/fs/mpage.c b/fs/mpage.c
index c820dc9bebab..3f19da75178b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
page_endio(page, bio_op(bio),
blk_status_to_errno(bio->bi_status));
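A sketch of the updated iteration idiom: the caller now supplies a bvec_iter_all that the macro uses to track its position (that this supports multi-page bvecs is our reading of the motivation, not stated in this hunk):

	struct bio_vec *bv;
	struct bvec_iter_all iter_all;
	int i;

	bio_for_each_segment_all(bv, bio, i, iter_all) {
		/* bv->bv_page, bv->bv_offset, bv->bv_len for this segment */
	}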
diff --git a/fs/namei.c b/fs/namei.c
index 0cab6494978c..0a8c5c27f90e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,7 +39,6 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
-#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
- BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
@@ -2720,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
if (unlikely(error == -ESTALE))
error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
if (likely(!error))
- audit_inode(name, path->dentry, 0);
+ audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
restore_nameidata();
putname(name);
return error;
@@ -3701,8 +3699,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
if (error)
return error;
- if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
- !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD))
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
diff --git a/fs/namespace.c b/fs/namespace.c
index d86830c86ce8..98a8c182af4f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,9 +23,10 @@
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
+#include <uapi/linux/mount.h>
#include "pnode.h"
#include "internal.h"
@@ -245,13 +246,9 @@ out_free_cache:
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
-int __mnt_is_readonly(struct vfsmount *mnt)
+bool __mnt_is_readonly(struct vfsmount *mnt)
{
- if (mnt->mnt_flags & MNT_READONLY)
- return 1;
- if (sb_rdonly(mnt->mnt_sb))
- return 1;
- return 0;
+ return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
@@ -507,11 +504,12 @@ static int mnt_make_readonly(struct mount *mnt)
return ret;
}
-static void __mnt_unmake_readonly(struct mount *mnt)
+static int __mnt_unmake_readonly(struct mount *mnt)
{
lock_mount_hash();
mnt->mnt.mnt_flags &= ~MNT_READONLY;
unlock_mount_hash();
+ return 0;
}
int sb_prepare_remount_readonly(struct super_block *sb)
@@ -695,9 +693,6 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
- /* might be worth a WARN_ON() */
- if (d_unlinked(dentry))
- return ERR_PTR(-ENOENT);
mp->m_count++;
return mp;
}
@@ -711,6 +706,9 @@ static struct mountpoint *get_mountpoint(struct dentry *dentry)
int ret;
if (d_mountpoint(dentry)) {
+ /* might be worth a WARN_ON() */
+ if (d_unlinked(dentry))
+ return ERR_PTR(-ENOENT);
mountpoint:
read_seqlock_excl(&mount_lock);
mp = lookup_mountpoint(dentry);
@@ -1360,7 +1358,7 @@ static void namespace_unlock(void)
if (likely(hlist_empty(&head)))
return;
- synchronize_rcu();
+ synchronize_rcu_expedited();
group_pin_kill(&head);
}
@@ -1540,8 +1538,13 @@ static int do_umount(struct mount *mnt, int flags)
namespace_lock();
lock_mount_hash();
- event++;
+ /* Recheck MNT_LOCKED with the locks held */
+ retval = -EINVAL;
+ if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ goto out;
+
+ event++;
if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
@@ -1555,6 +1558,7 @@ static int do_umount(struct mount *mnt, int flags)
retval = 0;
}
}
+out:
unlock_mount_hash();
namespace_unlock();
return retval;
@@ -1636,6 +1640,8 @@ int ksys_umount(char __user *name, int flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+ lookup_flags |= LOOKUP_NO_EVAL;
+
retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
@@ -1645,7 +1651,7 @@ int ksys_umount(char __user *name, int flags)
goto dput_and_out;
if (!check_mnt(mnt))
goto dput_and_out;
- if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
goto dput_and_out;
retval = -EPERM;
if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
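The MNT_LOCKED handling is now a two-step check, sketched below: the unlocked test fails fast, and do_umount() repeats it authoritatively once the locks are held:

	/* In ksys_umount(), before any locks are taken: */
	if (mnt->mnt.mnt_flags & MNT_LOCKED)	/* optimistic fast path */
		goto dput_and_out;

	/* Later, in do_umount(), with namespace_lock and mount hash held: */
	retval = -EINVAL;
	if (mnt->mnt.mnt_flags & MNT_LOCKED)	/* authoritative recheck */
		goto out;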
@@ -1728,8 +1734,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
for (s = r; s; s = next_mnt(s, r)) {
if (!(flag & CL_COPY_UNBINDABLE) &&
IS_MNT_UNBINDABLE(s)) {
- s = skip_mnt_tree(s);
- continue;
+ if (s->mnt.mnt_flags & MNT_LOCKED) {
+ /* Both unbindable and locked. */
+ q = ERR_PTR(-EPERM);
+ goto out;
+ } else {
+ s = skip_mnt_tree(s);
+ continue;
+ }
}
if (!(flag & CL_COPY_MNT_NS_FILE) &&
is_mnt_ns_file(s->mnt.mnt_root)) {
@@ -1782,7 +1794,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
lock_mount_hash();
- umount_tree(real_mount(mnt), UMOUNT_SYNC);
+ umount_tree(real_mount(mnt), 0);
unlock_mount_hash();
namespace_unlock();
}
@@ -2203,21 +2215,91 @@ out:
return err;
}
-static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+/*
+ * Don't allow locked mount flags to be cleared.
+ *
+ * No locks need to be held here while testing the various MNT_LOCK
+ * flags because those flags can never be cleared once they are set.
+ */
+static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
+{
+ unsigned int fl = mnt->mnt.mnt_flags;
+
+ if ((fl & MNT_LOCK_READONLY) &&
+ !(mnt_flags & MNT_READONLY))
+ return false;
+
+ if ((fl & MNT_LOCK_NODEV) &&
+ !(mnt_flags & MNT_NODEV))
+ return false;
+
+ if ((fl & MNT_LOCK_NOSUID) &&
+ !(mnt_flags & MNT_NOSUID))
+ return false;
+
+ if ((fl & MNT_LOCK_NOEXEC) &&
+ !(mnt_flags & MNT_NOEXEC))
+ return false;
+
+ if ((fl & MNT_LOCK_ATIME) &&
+ ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
+ return false;
+
+ return true;
+}
+
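A hypothetical userspace illustration: with a flag pinned by MNT_LOCK_NOSUID (for example, on a mount propagated into a user namespace), a remount that omits MS_NOSUID -- and thus asks to clear it -- is refused up front. The user-namespace scenario is an assumption, not part of this patch:

#include <sys/mount.h>
#include <errno.h>
#include <stdio.h>

int try_drop_nosuid(const char *target)
{
	/* Omitting MS_NOSUID requests clearing it; the locked flag
	 * makes can_change_locked_flags() fail, so we expect EPERM. */
	if (mount(NULL, target, NULL, MS_REMOUNT | MS_BIND, NULL) < 0) {
		perror("remount");
		return -errno;
	}
	return 0;
}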
+static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
- int error = 0;
- int readonly_request = 0;
+ bool readonly_request = (mnt_flags & MNT_READONLY);
- if (ms_flags & MS_RDONLY)
- readonly_request = 1;
- if (readonly_request == __mnt_is_readonly(mnt))
+ if (readonly_request == __mnt_is_readonly(&mnt->mnt))
return 0;
if (readonly_request)
- error = mnt_make_readonly(real_mount(mnt));
- else
- __mnt_unmake_readonly(real_mount(mnt));
- return error;
+ return mnt_make_readonly(mnt);
+
+ return __mnt_unmake_readonly(mnt);
+}
+
+/*
+ * Update the user-settable attributes on a mount. The caller must hold
+ * sb->s_umount for writing.
+ */
+static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
+{
+ lock_mount_hash();
+ mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
+ mnt->mnt.mnt_flags = mnt_flags;
+ touch_mnt_namespace(mnt->mnt_ns);
+ unlock_mount_hash();
+}
+
+/*
+ * Handle reconfiguration of the mountpoint only without alteration of the
+ * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
+ * to mount(2).
+ */
+static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
+{
+ struct super_block *sb = path->mnt->mnt_sb;
+ struct mount *mnt = real_mount(path->mnt);
+ int ret;
+
+ if (!check_mnt(mnt))
+ return -EINVAL;
+
+ if (path->dentry != mnt->mnt.mnt_root)
+ return -EINVAL;
+
+ if (!can_change_locked_flags(mnt, mnt_flags))
+ return -EPERM;
+
+ down_write(&sb->s_umount);
+ ret = change_mount_ro_state(mnt, mnt_flags);
+ if (ret == 0)
+ set_mount_attributes(mnt, mnt_flags);
+ up_write(&sb->s_umount);
+ return ret;
}
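The userspace trigger, sketched: MS_REMOUNT|MS_BIND now changes only the per-mountpoint flags, here flipping one mountpoint read-only without entering the filesystem's own remount path:

#include <sys/mount.h>

int make_mountpoint_ro(const char *target)
{
	return mount(NULL, target, NULL,
		     MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
}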
/*
@@ -2231,6 +2313,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
int err;
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
+ void *sec_opts = NULL;
if (!check_mnt(mnt))
return -EINVAL;
@@ -2238,50 +2321,25 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
- /* Don't allow changing of locked mnt flags.
- *
- * No locks need to be held here while testing the various
- * MNT_LOCK flags because those flags can never be cleared
- * once they are set.
- */
- if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
- !(mnt_flags & MNT_READONLY)) {
- return -EPERM;
- }
- if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
- !(mnt_flags & MNT_NODEV)) {
- return -EPERM;
- }
- if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
- !(mnt_flags & MNT_NOSUID)) {
- return -EPERM;
- }
- if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
- !(mnt_flags & MNT_NOEXEC)) {
+ if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
- }
- if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
- ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
- return -EPERM;
- }
- err = security_sb_remount(sb, data);
+ if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) {
+ err = security_sb_eat_lsm_opts(data, &sec_opts);
+ if (err)
+ return err;
+ }
+ err = security_sb_remount(sb, sec_opts);
+ security_free_mnt_opts(&sec_opts);
if (err)
return err;
down_write(&sb->s_umount);
- if (ms_flags & MS_BIND)
- err = change_mount_flags(path->mnt, ms_flags);
- else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
- err = -EPERM;
- else
+ err = -EPERM;
+ if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
err = do_remount_sb(sb, sb_flags, data, 0);
- if (!err) {
- lock_mount_hash();
- mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
- mnt->mnt.mnt_flags = mnt_flags;
- touch_mnt_namespace(mnt->mnt_ns);
- unlock_mount_hash();
+ if (!err)
+ set_mount_attributes(mnt, mnt_flags);
}
up_write(&sb->s_umount);
return err;
@@ -2639,10 +2697,9 @@ static long exact_copy_from_user(void *to, const void __user * from,
const char __user *f = from;
char c;
- if (!access_ok(VERIFY_READ, from, n))
+ if (!access_ok(from, n))
return n;
- current->kernel_uaccess_faults_ok++;
while (n) {
if (__get_user(c, f)) {
memset(t, 0, n);
@@ -2652,7 +2709,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
f++;
n--;
}
- current->kernel_uaccess_faults_ok--;
return n;
}
@@ -2690,7 +2746,7 @@ void *copy_mount_options(const void __user * data)
char *copy_mount_string(const void __user *data)
{
- return data ? strndup_user(data, PAGE_SIZE) : NULL;
+ return data ? strndup_user(data, PATH_MAX) : NULL;
}
/*
@@ -2776,7 +2832,9 @@ long do_mount(const char *dev_name, const char __user *dir_name,
SB_LAZYTIME |
SB_I_VERSION);
- if (flags & MS_REMOUNT)
+ if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
+ retval = do_reconfigure_mnt(&path, mnt_flags);
+ else if (flags & MS_REMOUNT)
retval = do_remount(&path, flags, sb_flags, mnt_flags,
data_page);
else if (flags & MS_BIND)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 06cb0c1d9aee..690221747b47 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -584,7 +584,7 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
static struct nfs4_deviceid_node *
bl_find_get_deviceid(struct nfs_server *server,
- const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ const struct nfs4_deviceid *id, const struct cred *cred,
gfp_t gfp_mask)
{
struct nfs4_deviceid_node *node;
@@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
- end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+ end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 509dc5adeb8f..0b602a39dd71 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -56,7 +56,7 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret > 0) {
nn->nfs_callback_tcpport6 = ret;
- dprintk("NFS: Callback listener port = %u (af %u, net %x\n",
+ dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum);
} else if (ret != -EAFNOSUPPORT)
goto out_err;
@@ -206,11 +206,13 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
goto err_bind;
}
- ret = -EPROTONOSUPPORT;
+ ret = 0;
if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0)
ret = nfs4_callback_up_net(serv, net);
- else if (xprt->ops->bc_up)
- ret = xprt->ops->bc_up(serv, net);
+ else if (xprt->ops->bc_setup)
+ set_bc_enabled(serv);
+ else
+ ret = -EPROTONOSUPPORT;
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index fa515d5ea5ba..315967354954 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -66,7 +66,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
out_iput:
rcu_read_unlock();
trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
- iput(inode);
+ nfs_iput_and_deactive(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
return res->status;
@@ -108,7 +108,7 @@ __be32 nfs4_callback_recall(void *argp, void *resp,
}
trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
&args->stateid, -ntohl(res));
- iput(inode);
+ nfs_iput_and_deactive(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
return res;
@@ -686,20 +686,24 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
{
struct cb_offloadargs *args = data;
struct nfs_server *server;
- struct nfs4_copy_state *copy;
+ struct nfs4_copy_state *copy, *tmp_copy;
bool found = false;
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+ if (!copy)
+ return htonl(NFS4ERR_SERVERFAULT);
+
spin_lock(&cps->clp->cl_lock);
rcu_read_lock();
list_for_each_entry_rcu(server, &cps->clp->cl_superblocks,
client_link) {
- list_for_each_entry(copy, &server->ss_copies, copies) {
+ list_for_each_entry(tmp_copy, &server->ss_copies, copies) {
if (memcmp(args->coa_stateid.other,
- copy->stateid.other,
+ tmp_copy->stateid.other,
sizeof(args->coa_stateid.other)))
continue;
- nfs4_copy_cb_args(copy, args);
- complete(&copy->completion);
+ nfs4_copy_cb_args(tmp_copy, args);
+ complete(&tmp_copy->completion);
found = true;
goto out;
}
@@ -707,15 +711,11 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
out:
rcu_read_unlock();
if (!found) {
- copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
- if (!copy) {
- spin_unlock(&cps->clp->cl_lock);
- return htonl(NFS4ERR_SERVERFAULT);
- }
memcpy(&copy->stateid, &args->coa_stateid, NFS4_STATEID_SIZE);
nfs4_copy_cb_args(copy, args);
list_add_tail(&copy->copies, &cps->clp->pending_cb_stateids);
- }
+ } else
+ kfree(copy);
spin_unlock(&cps->clp->cl_lock);
return 0;
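The shape of the fix above, sketched: kzalloc(GFP_NOFS) may sleep and so cannot run under cl_lock. Allocate first, and free on the path that turns out not to need the preallocation:

	copy = kzalloc(sizeof(*copy), GFP_NOFS);
	if (!copy)
		return htonl(NFS4ERR_SERVERFAULT);

	spin_lock(&cps->clp->cl_lock);
	/* search the pending copies for a matching stateid */
	if (found)
		kfree(copy);		/* preallocation not needed */
	else
		list_add_tail(&copy->copies, &cps->clp->pending_cb_stateids);
	spin_unlock(&cps->clp->cl_lock);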
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 96d5f8135eb9..fb1cf1a4bda2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -151,7 +151,6 @@ EXPORT_SYMBOL_GPL(unregister_nfs_version);
struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
{
struct nfs_client *clp;
- struct rpc_cred *cred;
int err = -ENOMEM;
if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
@@ -182,9 +181,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_proto = cl_init->proto;
clp->cl_net = get_net(cl_init->net);
- cred = rpc_lookup_machine_cred("*");
- if (!IS_ERR(cred))
- clp->cl_machine_cred = cred;
+ clp->cl_principal = "*";
nfs_fscache_get_client_cookie(clp);
return clp;
@@ -246,9 +243,6 @@ void nfs_free_client(struct nfs_client *clp)
if (!IS_ERR(clp->cl_rpcclient))
rpc_shutdown_client(clp->cl_rpcclient);
- if (clp->cl_machine_cred != NULL)
- put_rpccred(clp->cl_machine_cred);
-
put_net(clp->cl_net);
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
@@ -527,6 +521,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
return PTR_ERR(clnt);
}
+ clnt->cl_principal = clp->cl_principal;
clp->cl_rpcclient = clnt;
return 0;
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index f033f3a69a3b..885363ca8569 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -26,10 +26,8 @@
static void nfs_free_delegation(struct nfs_delegation *delegation)
{
- if (delegation->cred) {
- put_rpccred(delegation->cred);
- delegation->cred = NULL;
- }
+ put_cred(delegation->cred);
+ delegation->cred = NULL;
kfree_rcu(delegation, rcu);
}
@@ -93,7 +91,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags)
return nfs4_do_check_delegation(inode, flags, false);
}
-static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
{
struct inode *inode = state->inode;
struct file_lock *fl;
@@ -108,7 +106,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
spin_lock(&flctx->flc_lock);
restart:
list_for_each_entry(fl, list, fl_list) {
- if (nfs_file_open_context(fl->fl_file) != ctx)
+ if (nfs_file_open_context(fl->fl_file)->state != state)
continue;
spin_unlock(&flctx->flc_lock);
status = nfs4_lock_delegation_recall(fl, state, stateid);
@@ -136,8 +134,8 @@ static int nfs_delegation_claim_opens(struct inode *inode,
int err;
again:
- spin_lock(&inode->i_lock);
- list_for_each_entry(ctx, &nfsi->open_files, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
state = ctx->state;
if (state == NULL)
continue;
@@ -147,15 +145,16 @@ again:
continue;
if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
- get_nfs_open_context(ctx);
- spin_unlock(&inode->i_lock);
+ if (!get_nfs_open_context(ctx))
+ continue;
+ rcu_read_unlock();
sp = state->owner;
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
err = nfs4_open_delegation_recall(ctx, state, stateid, type);
if (!err)
- err = nfs_delegation_claim_locks(ctx, state, stateid);
+ err = nfs_delegation_claim_locks(state, stateid);
if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
@@ -164,7 +163,7 @@ again:
return err;
goto again;
}
- spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return 0;
}
@@ -177,13 +176,13 @@ again:
* @pagemod_limit: write delegation "space_limit"
*
*/
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
fmode_t type,
const nfs4_stateid *stateid,
unsigned long pagemod_limit)
{
struct nfs_delegation *delegation;
- struct rpc_cred *oldcred = NULL;
+ const struct cred *oldcred = NULL;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
@@ -194,12 +193,12 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
delegation->type = type;
delegation->pagemod_limit = pagemod_limit;
oldcred = delegation->cred;
- delegation->cred = get_rpccred(cred);
+ delegation->cred = get_cred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags);
spin_unlock(&delegation->lock);
rcu_read_unlock();
- put_rpccred(oldcred);
+ put_cred(oldcred);
trace_nfs4_reclaim_delegation(inode, type);
return;
}
@@ -340,7 +339,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
*
* Returns zero on success, or a negative errno value.
*/
-int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
fmode_t type,
const nfs4_stateid *stateid,
unsigned long pagemod_limit)
@@ -359,7 +358,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
delegation->type = type;
delegation->pagemod_limit = pagemod_limit;
delegation->change_attr = inode_peek_iversion_raw(inode);
- delegation->cred = get_rpccred(cred);
+ delegation->cred = get_cred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
spin_lock_init(&delegation->lock);
@@ -849,16 +848,23 @@ nfs_delegation_find_inode_server(struct nfs_server *server,
const struct nfs_fh *fhandle)
{
struct nfs_delegation *delegation;
- struct inode *res = NULL;
+ struct inode *freeme, *res = NULL;
list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
spin_lock(&delegation->lock);
if (delegation->inode != NULL &&
nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
- res = igrab(delegation->inode);
+ freeme = igrab(delegation->inode);
+ if (freeme && nfs_sb_active(freeme->i_sb))
+ res = freeme;
spin_unlock(&delegation->lock);
if (res != NULL)
return res;
+ if (freeme) {
+ rcu_read_unlock();
+ iput(freeme);
+ rcu_read_lock();
+ }
return ERR_PTR(-EAGAIN);
}
spin_unlock(&delegation->lock);
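The shape of the fix above, sketched: return the inode only when references on both it and its superblock were obtained, and drop a half-acquired inode outside the RCU read section, since iput() may sleep:

	freeme = igrab(delegation->inode);
	if (freeme && nfs_sb_active(freeme->i_sb))
		res = freeme;			/* both references held */
	spin_unlock(&delegation->lock);
	if (res != NULL)
		return res;
	if (freeme) {
		rcu_read_unlock();
		iput(freeme);			/* may sleep: not under RCU */
		rcu_read_lock();
	}
	return ERR_PTR(-EAGAIN);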
@@ -1039,7 +1045,7 @@ void nfs_reap_expired_delegations(struct nfs_client *clp)
struct nfs_delegation *delegation;
struct nfs_server *server;
struct inode *inode;
- struct rpc_cred *cred;
+ const struct cred *cred;
nfs4_stateid stateid;
restart:
@@ -1061,7 +1067,7 @@ restart:
nfs_sb_deactive(server->super);
goto restart;
}
- cred = get_rpccred_rcu(delegation->cred);
+ cred = get_cred_rcu(delegation->cred);
nfs4_stateid_copy(&stateid, &delegation->stateid);
clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
rcu_read_unlock();
@@ -1070,7 +1076,7 @@ restart:
nfs_revoke_delegation(inode, &stateid);
nfs_inode_find_state_and_recover(inode, &stateid);
}
- put_rpccred(cred);
+ put_cred(cred);
if (nfs4_server_rebooted(clp)) {
nfs_inode_mark_test_expired_delegation(server,inode);
iput(inode);
@@ -1165,7 +1171,7 @@ out:
* otherwise "false" is returned.
*/
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
- nfs4_stateid *dst, struct rpc_cred **cred)
+ nfs4_stateid *dst, const struct cred **cred)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
@@ -1179,7 +1185,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
nfs4_stateid_copy(dst, &delegation->stateid);
nfs_mark_delegation_referenced(delegation);
if (cred)
- *cred = get_rpccred(delegation->cred);
+ *cred = get_cred(delegation->cred);
}
rcu_read_unlock();
return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index bb1ef8c37af4..dcbf3394ba0e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -15,7 +15,7 @@
*/
struct nfs_delegation {
struct list_head super_list;
- struct rpc_cred *cred;
+ const struct cred *cred;
struct inode *inode;
nfs4_stateid stateid;
fmode_t type;
@@ -36,9 +36,9 @@ enum {
NFS_DELEGATION_TEST_EXPIRED,
};
-int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
int nfs4_inode_return_delegation(struct inode *inode);
int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
@@ -60,10 +60,10 @@ void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
void nfs_reap_expired_delegations(struct nfs_client *clp);
/* NFSv4 delegation-related procedures */
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8bfaa658b2c1..6bf4471850c8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -67,7 +67,7 @@ const struct address_space_operations nfs_dir_aops = {
.freepage = nfs_readdir_clear_array,
};
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred)
{
struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_open_dir_context *ctx;
@@ -77,7 +77,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
ctx->attr_gencount = nfsi->attr_gencount;
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
- ctx->cred = get_rpccred(cred);
+ ctx->cred = get_cred(cred);
spin_lock(&dir->i_lock);
list_add(&ctx->list, &nfsi->open_files);
spin_unlock(&dir->i_lock);
@@ -91,7 +91,7 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont
spin_lock(&dir->i_lock);
list_del(&ctx->list);
spin_unlock(&dir->i_lock);
- put_rpccred(ctx->cred);
+ put_cred(ctx->cred);
kfree(ctx);
}
@@ -103,23 +103,18 @@ nfs_opendir(struct inode *inode, struct file *filp)
{
int res = 0;
struct nfs_open_dir_context *ctx;
- struct rpc_cred *cred;
dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);
nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return PTR_ERR(cred);
- ctx = alloc_nfs_open_dir_context(inode, cred);
+ ctx = alloc_nfs_open_dir_context(inode, current_cred());
if (IS_ERR(ctx)) {
res = PTR_ERR(ctx);
goto out;
}
filp->private_data = ctx;
out:
- put_rpccred(cred);
return res;
}
@@ -334,7 +329,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
struct nfs_entry *entry, struct file *file, struct inode *inode)
{
struct nfs_open_dir_context *ctx = file->private_data;
- struct rpc_cred *cred = ctx->cred;
+ const struct cred *cred = ctx->cred;
unsigned long timestamp, gencount;
int error;
@@ -1072,6 +1067,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+ struct inode *inode, int error)
+{
+ switch (error) {
+ case 1:
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+ __func__, dentry);
+ return 1;
+ case 0:
+ nfs_mark_for_revalidate(dir);
+ if (inode && S_ISDIR(inode->i_mode)) {
+ /* Purge readdir caches. */
+ nfs_zap_caches(inode);
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (IS_ROOT(dentry))
+ return 1;
+ }
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+ __func__, dentry);
+ return 0;
+ }
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+ __func__, dentry, error);
+ return error;
+}
+
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ int ret = 1;
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ ret = 0;
+ }
+ return nfs_lookup_revalidate_done(dir, dentry, NULL, ret);
+}
+
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+static int
+nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct nfs_fh *fhandle;
+ struct nfs_fattr *fattr;
+ struct nfs4_label *label;
+ int ret;
+
+ ret = -ENOMEM;
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+ goto out;
+
+ ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ if (ret < 0) {
+ if (ret == -ESTALE || ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+ ret = 0;
+ if (nfs_compare_fh(NFS_FH(inode), fhandle))
+ goto out;
+ if (nfs_refresh_inode(inode, fattr) < 0)
+ goto out;
+
+ nfs_setsecurity(inode, fattr, label);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+ /* set a readdirplus hint that we had a cache miss */
+ nfs_force_use_readdirplus(dir);
+ ret = 1;
+out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
/*
* This is called every time the dcache has a lookup hit,
* and we should check whether we can really trust that
@@ -1083,58 +1172,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
* If the parent directory is seen to have changed, we throw out the
* cached dentry and do a new lookup.
*/
-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
- struct inode *dir;
struct inode *inode;
- struct dentry *parent;
- struct nfs_fh *fhandle = NULL;
- struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
int error;
- if (flags & LOOKUP_RCU) {
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- }
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = d_inode(dentry);
- if (!inode) {
- if (nfs_neg_need_reval(dir, dentry, flags)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
- goto out_bad;
- }
- goto out_valid;
- }
+ if (!inode)
+ return nfs_lookup_revalidate_negative(dir, dentry, flags);
if (is_bad_inode(inode)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
__func__, dentry);
goto out_bad;
}
if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
- goto out_set_verifier;
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* Force a full look up iff the parent directory has changed */
if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) &&
nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
error = nfs_lookup_verify_inode(inode, flags);
if (error) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
if (error == -ESTALE)
- goto out_zap_parent;
- goto out_error;
+ nfs_zap_caches(dir);
+ goto out_bad;
}
nfs_advise_use_readdirplus(dir);
goto out_valid;
@@ -1146,81 +1213,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (NFS_STALE(inode))
goto out_bad;
- error = -ENOMEM;
- fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
- if (fhandle == NULL || fattr == NULL)
- goto out_error;
-
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
- if (IS_ERR(label))
- goto out_error;
-
trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
- if (error == -ESTALE || error == -ENOENT)
- goto out_bad;
- if (error)
- goto out_error;
- if (nfs_compare_fh(NFS_FH(inode), fhandle))
- goto out_bad;
- if ((error = nfs_refresh_inode(inode, fattr)) != 0)
- goto out_bad;
-
- nfs_setsecurity(inode, fattr, label);
-
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
+ return error;
+out_valid:
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
- /* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+ int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int ret;
-out_set_verifier:
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- out_valid:
if (flags & LOOKUP_RCU) {
+ parent = READ_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ ret = reval(dir, dentry, flags);
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
- } else
+ } else {
+ parent = dget_parent(dentry);
+ ret = reval(d_inode(parent), dentry, flags);
dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
- __func__, dentry);
- return 1;
-out_zap_parent:
- nfs_zap_caches(dir);
- out_bad:
- WARN_ON(flags & LOOKUP_RCU);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
- nfs_mark_for_revalidate(dir);
- if (inode && S_ISDIR(inode->i_mode)) {
- /* Purge readdir caches. */
- nfs_zap_caches(inode);
- /*
- * We can't d_drop the root of a disconnected tree:
- * its d_hash is on the s_anon list and d_drop() would hide
- * it from shrink_dcache_for_unmount(), leading to busy
- * inodes on unmount and further oopses.
- */
- if (IS_ROOT(dentry))
- goto out_valid;
}
- dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
- __func__, dentry);
- return 0;
-out_error:
- WARN_ON(flags & LOOKUP_RCU);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
- dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
- __func__, dentry, error);
- return error;
+ return ret;
+}
+
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
}
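__nfs_lookup_revalidate() centralizes the parent-resolution dance that both entry points previously open-coded: in RCU-walk the parent is sampled without taking a reference and re-checked once the callback returns, so a concurrent rename simply forces a -ECHILD retry in ref-walk mode. The distilled pattern, with illustrative names:

static int reval_under_rcu(struct dentry *dentry, unsigned int flags,
			   int (*work)(struct inode *, struct dentry *,
				       unsigned int))
{
	struct dentry *parent = READ_ONCE(dentry->d_parent);
	struct inode *dir = d_inode_rcu(parent);
	int ret;

	if (!dir)
		return -ECHILD;		/* parent is being torn down */
	ret = work(dir, dentry, flags);
	if (parent != READ_ONCE(dentry->d_parent))
		return -ECHILD;		/* raced with rename: retry in ref-walk */
	return ret;
}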
/*
@@ -1579,62 +1610,55 @@ no_open:
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
struct inode *inode;
- int ret = 0;
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
- goto no_open;
+ goto full_reval;
if (d_mountpoint(dentry))
- goto no_open;
- if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
- goto no_open;
+ goto full_reval;
inode = d_inode(dentry);
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
- if (inode == NULL) {
- struct dentry *parent;
- struct inode *dir;
-
- if (flags & LOOKUP_RCU) {
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- }
- if (!nfs_neg_need_reval(dir, dentry, flags))
- ret = 1;
- else if (flags & LOOKUP_RCU)
- ret = -ECHILD;
- if (!(flags & LOOKUP_RCU))
- dput(parent);
- else if (parent != READ_ONCE(dentry->d_parent))
- return -ECHILD;
- goto out;
- }
+ if (inode == NULL)
+ goto full_reval;
+
+ if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open;
+ goto full_reval;
+
/* We cannot do exclusive creation on a positive dentry */
- if (flags & LOOKUP_EXCL)
- goto no_open;
+ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+ goto reval_dentry;
+
+ /* Check if the directory changed */
+ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+ goto reval_dentry;
/* Let f_op->open() actually open (and revalidate) the file */
- ret = 1;
+ return 1;
+reval_dentry:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);
-out:
- return ret;
+full_reval:
+ return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
-no_open:
- return nfs_lookup_revalidate(dentry, flags);
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags,
+ nfs4_do_lookup_revalidate);
}
#endif /* CONFIG_NFSV4 */
@@ -2110,7 +2134,7 @@ MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache lengt
static void nfs_access_free_entry(struct nfs_access_entry *entry)
{
- put_rpccred(entry->cred);
+ put_cred(entry->cred);
kfree_rcu(entry, rcu_head);
smp_mb__before_atomic();
atomic_long_dec(&nfs_access_nr_entries);
@@ -2236,17 +2260,18 @@ void nfs_access_zap_cache(struct inode *inode)
}
EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
-static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred)
{
struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
- struct nfs_access_entry *entry;
while (n != NULL) {
- entry = rb_entry(n, struct nfs_access_entry, rb_node);
+ struct nfs_access_entry *entry =
+ rb_entry(n, struct nfs_access_entry, rb_node);
+ int cmp = cred_fscmp(cred, entry->cred);
- if (cred < entry->cred)
+ if (cmp < 0)
n = n->rb_left;
- else if (cred > entry->cred)
+ else if (cmp > 0)
n = n->rb_right;
else
return entry;
@@ -2254,7 +2279,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
return NULL;
}
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
+static int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
@@ -2297,7 +2322,7 @@ out_zap:
return -ENOENT;
}
-static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res)
{
/* Only check the most recently returned cache entry,
* but do it without locking.
@@ -2334,15 +2359,17 @@ static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *
struct rb_node **p = &root_node->rb_node;
struct rb_node *parent = NULL;
struct nfs_access_entry *entry;
+ int cmp;
spin_lock(&inode->i_lock);
while (*p != NULL) {
parent = *p;
entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+ cmp = cred_fscmp(set->cred, entry->cred);
- if (set->cred < entry->cred)
+ if (cmp < 0)
p = &parent->rb_left;
- else if (set->cred > entry->cred)
+ else if (cmp > 0)
p = &parent->rb_right;
else
goto found;
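Both rbtree walks above switch from ordering entries by struct rpc_cred pointer value to cred_fscmp(), a stable three-way comparison over the fields that matter for filesystem access, so two distinct cred objects describing the same identity now hit the same cache entry. A self-contained userspace stand-in (the real comparator also considers supplementary groups):

#include <stdio.h>

struct cred { unsigned int fsuid, fsgid; };

static int cred_fscmp(const struct cred *a, const struct cred *b)
{
	if (a->fsuid != b->fsuid)
		return a->fsuid < b->fsuid ? -1 : 1;
	if (a->fsgid != b->fsgid)
		return a->fsgid < b->fsgid ? -1 : 1;
	return 0;
}

int main(void)
{
	struct cred c1 = { 1000, 1000 }, c2 = { 1000, 1000 };

	/* Distinct objects, same identity: a pointer comparison would
	 * treat these as different cache keys; cred_fscmp() does not. */
	printf("%d\n", cred_fscmp(&c1, &c2));	/* prints 0 */
	return 0;
}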
@@ -2366,7 +2393,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
if (cache == NULL)
return;
RB_CLEAR_NODE(&cache->rb_node);
- cache->cred = get_rpccred(set->cred);
+ cache->cred = get_cred(set->cred);
cache->mask = set->mask;
/* The above field assignments must be visible
@@ -2430,7 +2457,7 @@ void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
}
EXPORT_SYMBOL_GPL(nfs_access_set_mask);
-static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
+static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
{
struct nfs_access_entry cache;
bool may_block = (mask & MAY_NOT_BLOCK) == 0;
@@ -2494,7 +2521,7 @@ static int nfs_open_permission_mask(int openflags)
return mask;
}
-int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags)
{
return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
}
@@ -2519,7 +2546,7 @@ static int nfs_execute_ok(struct inode *inode, int mask)
int nfs_permission(struct inode *inode, int mask)
{
- struct rpc_cred *cred;
+ const struct cred *cred = current_cred();
int res = 0;
nfs_inc_stats(inode, NFSIOS_VFSACCESS);
@@ -2553,20 +2580,11 @@ force_lookup:
/* Always try fast lookups first */
rcu_read_lock();
- cred = rpc_lookup_cred_nonblock();
- if (!IS_ERR(cred))
- res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
- else
- res = PTR_ERR(cred);
+ res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
rcu_read_unlock();
if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
/* Fast lookup failed, try the slow way */
- cred = rpc_lookup_cred();
- if (!IS_ERR(cred)) {
- res = nfs_do_access(inode, cred, mask);
- put_rpccred(cred);
- } else
- res = PTR_ERR(cred);
+ res = nfs_do_access(inode, cred, mask);
}
out:
if (!res && (mask & MAY_EXEC))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index aa12c3063bae..33824a0a57bf 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -98,8 +98,11 @@ struct nfs_direct_req {
struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
struct work_struct work;
int flags;
+ /* for write */
#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+ /* for read */
+#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
struct nfs_writeverf verf; /* unstable write verifier */
};
@@ -412,7 +415,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
struct page *page = req->wb_page;
- if (!PageCompound(page) && bytes < hdr->good_bytes)
+ if (!PageCompound(page) && bytes < hdr->good_bytes &&
+ (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
set_page_dirty(page);
bytes += req->wb_bytes;
nfs_list_remove_request(req);
@@ -587,6 +591,9 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ if (iter_is_iovec(iter))
+ dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
+
nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
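The two direct.c hunks cooperate: nfs_file_direct_read() records whether the destination is a user-backed iovec, and the completion handler only dirties pages in that case. Iovec pages were pinned from user space and must be marked dirty so the freshly read data survives reclaim; bvec/kvec destinations (a splice read, for instance) belong to the caller and must not be touched. A sketch of the gate, with an illustrative helper name:

static void nfs_direct_note_dirty(struct nfs_direct_req *dreq,
				  const struct iov_iter *iter)
{
	/* Only user-backed buffers need dirtying on read completion. */
	if (iter_is_iovec(iter))
		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
}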
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658eab66..a7d3df85736d 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -65,6 +65,7 @@ struct nfs_dns_ent {
struct sockaddr_storage addr;
size_t addrlen;
+ struct rcu_head rcu_head;
};
@@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
}
}
-static void nfs_dns_ent_put(struct kref *ref)
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
{
struct nfs_dns_ent *item;
- item = container_of(ref, struct nfs_dns_ent, h.ref);
+ item = container_of(head, struct nfs_dns_ent, rcu_head);
kfree(item->hostname);
kfree(item);
}
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu);
+}
+
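Splitting the kref release into call_rcu() plus a deferred free is what licenses the lockless sunrpc_cache_lookup_rcu() conversion below: a reader that finds the entry just as its last reference drops can keep dereferencing it until it leaves the RCU read-side critical section. The idiom, reduced to its parts:

struct ent {
	struct kref ref;
	struct rcu_head rcu_head;
};

static void ent_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct ent, rcu_head));
}

static void ent_put(struct kref *ref)
{
	struct ent *item = container_of(ref, struct ent, ref);

	/* Defer the kfree until all current RCU readers are done. */
	call_rcu(&item->rcu_head, ent_free_rcu);
}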
static struct cache_head *nfs_dns_ent_alloc(void)
{
struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
@@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
{
struct cache_head *ch;
- ch = sunrpc_cache_lookup(cd,
+ ch = sunrpc_cache_lookup_rcu(cd,
&key->h,
nfs_dns_hash(key));
if (!ch)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d175724ff566..61f46facb39c 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
.owner = THIS_MODULE,
+ .max_layoutget_response = 4096, /* 1 page or so... */
.alloc_layout_hdr = filelayout_alloc_layout_hdr,
.free_layout_hdr = filelayout_free_layout_hdr,
.alloc_lseg = filelayout_alloc_lseg,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index cae43333ef16..63abe705f4ca 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -9,6 +9,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/sched/mm.h>
#include <linux/sunrpc/metrics.h>
@@ -27,9 +28,6 @@
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
#define FF_LAYOUTRETURN_MAXERR 20
-
-static struct group_info *ff_zero_group;
-
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr);
static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
@@ -226,16 +224,14 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
ff_layout_remove_mirror(mirror);
kfree(mirror->fh_versions);
cred = rcu_access_pointer(mirror->ro_cred);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
cred = rcu_access_pointer(mirror->rw_cred);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
kfree(mirror);
}
@@ -413,8 +409,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
- struct auth_cred acred = { .group_info = ff_zero_group };
- struct rpc_cred __rcu *cred;
+ struct cred *kcred;
+ const struct cred *cred;
+ kuid_t uid;
+ kgid_t gid;
u32 ds_count, fh_count, id;
int j;
@@ -482,21 +480,28 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (rc)
goto out_err_free;
- acred.uid = make_kuid(&init_user_ns, id);
+ uid = make_kuid(&init_user_ns, id);
/* group */
rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
- acred.gid = make_kgid(&init_user_ns, id);
+ gid = make_kgid(&init_user_ns, id);
- /* find the cred for it */
- rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
- if (IS_ERR(cred)) {
- rc = PTR_ERR(cred);
- goto out_err_free;
+ if (gfp_flags & __GFP_FS)
+ kcred = prepare_kernel_cred(NULL);
+ else {
+ unsigned int nofs_flags = memalloc_nofs_save();
+ kcred = prepare_kernel_cred(NULL);
+ memalloc_nofs_restore(nofs_flags);
}
+ rc = -ENOMEM;
+ if (!kcred)
+ goto out_err_free;
+ kcred->fsuid = uid;
+ kcred->fsgid = gid;
+ cred = kcred;
if (lgr->range.iomode == IOMODE_READ)
rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
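prepare_kernel_cred() allocates with GFP_KERNEL internally, so when ff_layout_alloc_lseg() is entered with a gfp mask lacking __GFP_FS (a LAYOUTGET issued from the writeback path), the scoped-allocation API is used to forbid filesystem reclaim around the call; prepare_kernel_cred(NULL) models the new cred on init_cred. Isolated, with a hypothetical helper name:

static struct cred *ff_alloc_cred(gfp_t gfp_flags)
{
	unsigned int nofs_flags;
	struct cred *kcred;

	if (gfp_flags & __GFP_FS)
		return prepare_kernel_cred(NULL);

	/* No __GFP_FS: run the GFP_KERNEL allocation inside a NOFS
	 * scope so it cannot recurse into filesystem reclaim. */
	nofs_flags = memalloc_nofs_save();
	kcred = prepare_kernel_cred(NULL);
	memalloc_nofs_restore(nofs_flags);
	return kcred;
}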
@@ -519,8 +524,8 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
dprintk("%s: iomode %s uid %u gid %u\n", __func__,
lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
- from_kuid(&init_user_ns, acred.uid),
- from_kgid(&init_user_ns, acred.gid));
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid));
}
p = xdr_inline_decode(&stream, 4);
@@ -1361,12 +1366,7 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
task))
return;
- if (ff_layout_read_prepare_common(task, hdr))
- return;
-
- if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
- hdr->args.lock_context, FMODE_READ) == -EIO)
- rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+ ff_layout_read_prepare_common(task, hdr);
}
static void ff_layout_read_call_done(struct rpc_task *task, void *data)
@@ -1542,12 +1542,7 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
task))
return;
- if (ff_layout_write_prepare_common(task, hdr))
- return;
-
- if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
- hdr->args.lock_context, FMODE_WRITE) == -EIO)
- rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+ ff_layout_write_prepare_common(task, hdr);
}
static void ff_layout_write_call_done(struct rpc_task *task, void *data)
@@ -1708,7 +1703,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- struct rpc_cred *ds_cred;
+ const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
u32 idx = hdr->pgio_mirror_idx;
int vers;
@@ -1742,6 +1737,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
if (fh)
hdr->args.fh = fh;
+
+ if (vers == 4 &&
+ !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
+ goto out_failed;
+
/*
* Note that if we ever decide to split across DSes,
* then we may need to handle dense-like offsets.
@@ -1754,7 +1754,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
vers == 3 ? &ff_layout_read_call_ops_v3 :
&ff_layout_read_call_ops_v4,
0, RPC_TASK_SOFTCONN);
- put_rpccred(ds_cred);
+ put_cred(ds_cred);
return PNFS_ATTEMPTED;
out_failed:
@@ -1770,7 +1770,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- struct rpc_cred *ds_cred;
+ const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
@@ -1804,6 +1804,10 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
if (fh)
hdr->args.fh = fh;
+ if (vers == 4 &&
+ !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
+ goto out_failed;
+
/*
* Note that if we ever decide to split across DSes,
* then we may need to handle dense-like offsets.
@@ -1815,7 +1819,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
vers == 3 ? &ff_layout_write_call_ops_v3 :
&ff_layout_write_call_ops_v4,
sync, RPC_TASK_SOFTCONN);
- put_rpccred(ds_cred);
+ put_cred(ds_cred);
return PNFS_ATTEMPTED;
out_failed:
@@ -1845,7 +1849,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct pnfs_layout_segment *lseg = data->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- struct rpc_cred *ds_cred;
+ const struct cred *ds_cred;
u32 idx;
int vers, ret;
struct nfs_fh *fh;
@@ -1885,7 +1889,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
vers == 3 ? &ff_layout_commit_call_ops_v3 :
&ff_layout_commit_call_ops_v4,
how, RPC_TASK_SOFTCONN);
- put_rpccred(ds_cred);
+ put_cred(ds_cred);
return ret;
out_err:
pnfs_generic_prepare_to_resend_writes(data);
@@ -2356,6 +2360,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.name = "LAYOUT_FLEX_FILES",
.owner = THIS_MODULE,
.flags = PNFS_LAYOUTGET_ON_OPEN,
+ .max_layoutget_response = 4096, /* 1 page or so... */
.set_layoutdriver = ff_layout_set_layoutdriver,
.alloc_layout_hdr = ff_layout_alloc_layout_hdr,
.free_layout_hdr = ff_layout_free_layout_hdr,
@@ -2383,11 +2388,6 @@ static int __init nfs4flexfilelayout_init(void)
{
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
__func__);
- if (!ff_zero_group) {
- ff_zero_group = groups_alloc(0);
- if (!ff_zero_group)
- return -ENOMEM;
- }
return pnfs_register_layoutdriver(&flexfilelayout_type);
}
@@ -2396,10 +2396,6 @@ static void __exit nfs4flexfilelayout_exit(void)
printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
__func__);
pnfs_unregister_layoutdriver(&flexfilelayout_type);
- if (ff_zero_group) {
- put_group_info(ff_zero_group);
- ff_zero_group = NULL;
- }
}
MODULE_ALIAS("nfs-layouttype4-4");
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 411798346e48..c2626bad466b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -81,8 +81,8 @@ struct nfs4_ff_layout_mirror {
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
nfs4_stateid stateid;
- struct rpc_cred __rcu *ro_cred;
- struct rpc_cred __rcu *rw_cred;
+ const struct cred __rcu *ro_cred;
+ const struct cred __rcu *rw_cred;
refcount_t ref;
spinlock_t lock;
unsigned long flags;
@@ -215,6 +215,10 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
unsigned int maxnum);
struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
+int
+nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
+ u32 mirror_idx,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
@@ -225,8 +229,8 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
u32 ds_idx,
struct nfs_client *ds_clp,
struct inode *inode);
-struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
- u32 ds_idx, struct rpc_cred *mdscred);
+const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
+ u32 ds_idx, const struct cred *mdscred);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 59aa04976331..11766a74216d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -330,10 +330,10 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
return 0;
}
-static struct rpc_cred *
+static const struct cred *
ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
{
- struct rpc_cred *cred, __rcu **pcred;
+ const struct cred *cred, __rcu **pcred;
if (iomode == IOMODE_READ)
pcred = &mirror->ro_cred;
@@ -346,7 +346,7 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
if (!cred)
break;
- cred = get_rpccred_rcu(cred);
+ cred = get_cred_rcu(cred);
} while(!cred);
rcu_read_unlock();
return cred;
@@ -370,6 +370,25 @@ out:
return fh;
}
+int
+nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
+ u32 mirror_idx,
+ nfs4_stateid *stateid)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+
+ if (!ff_layout_mirror_valid(lseg, mirror, false)) {
+ pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
+ __func__, mirror_idx);
+ goto out;
+ }
+
+ nfs4_stateid_copy(stateid, &mirror->stateid);
+ return 1;
+out:
+ return 0;
+}
+
/**
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
@@ -446,19 +465,19 @@ out:
return ds;
}
-struct rpc_cred *
+const struct cred *
ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
- struct rpc_cred *mdscred)
+ const struct cred *mdscred)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- struct rpc_cred *cred;
+ const struct cred *cred;
- if (mirror) {
+ if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
if (!cred)
- cred = get_rpccred(mdscred);
+ cred = get_cred(mdscred);
} else {
- cred = get_rpccred(mdscred);
+ cred = get_cred(mdscred);
}
return cred;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b65aee481d13..094775ea0781 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -857,15 +857,14 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
{
- struct nfs_lock_context *head = &ctx->lock_context;
- struct nfs_lock_context *pos = head;
+ struct nfs_lock_context *pos;
- do {
+ list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) {
if (pos->lockowner != current->files)
continue;
- refcount_inc(&pos->count);
- return pos;
- } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
+ if (refcount_inc_not_zero(&pos->count))
+ return pos;
+ }
return NULL;
}
@@ -874,10 +873,10 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
struct nfs_lock_context *res, *new = NULL;
struct inode *inode = d_inode(ctx->dentry);
- spin_lock(&inode->i_lock);
+ rcu_read_lock();
res = __nfs_find_lock_context(ctx);
+ rcu_read_unlock();
if (res == NULL) {
- spin_unlock(&inode->i_lock);
new = kmalloc(sizeof(*new), GFP_KERNEL);
if (new == NULL)
return ERR_PTR(-ENOMEM);
@@ -885,14 +884,14 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
spin_lock(&inode->i_lock);
res = __nfs_find_lock_context(ctx);
if (res == NULL) {
- list_add_tail(&new->list, &ctx->lock_context.list);
+ list_add_tail_rcu(&new->list, &ctx->lock_context.list);
new->open_context = ctx;
res = new;
new = NULL;
}
+ spin_unlock(&inode->i_lock);
+ kfree(new);
}
- spin_unlock(&inode->i_lock);
- kfree(new);
return res;
}
EXPORT_SYMBOL_GPL(nfs_get_lock_context);
@@ -904,9 +903,9 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
return;
- list_del(&l_ctx->list);
+ list_del_rcu(&l_ctx->list);
spin_unlock(&inode->i_lock);
- kfree(l_ctx);
+ kfree_rcu(l_ctx, rcu_head);
}
EXPORT_SYMBOL_GPL(nfs_put_lock_context);
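This put path is the other half of the lockless __nfs_find_lock_context() above: readers take references with refcount_inc_not_zero(), which fails once the count has reached zero, while the final put unlinks with list_del_rcu() under i_lock and frees with kfree_rcu(). A dying entry therefore can never be handed out again. A sketch of the reader side, with a hypothetical name:

static struct nfs_lock_context *find_ctx_rcu(struct list_head *head)
{
	struct nfs_lock_context *pos;

	rcu_read_lock();
	list_for_each_entry_rcu(pos, head, list) {
		/* Fails once the last put dropped the count to zero. */
		if (refcount_inc_not_zero(&pos->count)) {
			rcu_read_unlock();
			return pos;
		}
	}
	rcu_read_unlock();
	return NULL;
}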
@@ -951,18 +950,17 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
struct file *filp)
{
struct nfs_open_context *ctx;
- struct rpc_cred *cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return ERR_CAST(cred);
+ const struct cred *cred = get_current_cred();
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx) {
- put_rpccred(cred);
+ put_cred(cred);
return ERR_PTR(-ENOMEM);
}
nfs_sb_active(dentry->d_sb);
ctx->dentry = dget(dentry);
ctx->cred = cred;
+ ctx->ll_cred = NULL;
ctx->state = NULL;
ctx->mode = f_mode;
ctx->flags = 0;
@@ -978,9 +976,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
{
- if (ctx != NULL)
- refcount_inc(&ctx->lock_context.count);
- return ctx;
+ if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count))
+ return ctx;
+ return NULL;
}
EXPORT_SYMBOL_GPL(get_nfs_open_context);
@@ -989,21 +987,21 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
struct inode *inode = d_inode(ctx->dentry);
struct super_block *sb = ctx->dentry->d_sb;
+ if (!refcount_dec_and_test(&ctx->lock_context.count))
+ return;
if (!list_empty(&ctx->list)) {
- if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
- return;
- list_del(&ctx->list);
+ spin_lock(&inode->i_lock);
+ list_del_rcu(&ctx->list);
spin_unlock(&inode->i_lock);
- } else if (!refcount_dec_and_test(&ctx->lock_context.count))
- return;
+ }
if (inode != NULL)
NFS_PROTO(inode)->close_context(ctx, is_sync);
- if (ctx->cred != NULL)
- put_rpccred(ctx->cred);
+ put_cred(ctx->cred);
dput(ctx->dentry);
nfs_sb_deactive(sb);
+ put_rpccred(ctx->ll_cred);
kfree(ctx->mdsthreshold);
- kfree(ctx);
+ kfree_rcu(ctx, rcu_head);
}
void put_nfs_open_context(struct nfs_open_context *ctx)
@@ -1027,10 +1025,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
- if (ctx->mode & FMODE_WRITE)
- list_add(&ctx->list, &nfsi->open_files);
- else
- list_add_tail(&ctx->list, &nfsi->open_files);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -1046,21 +1041,22 @@ EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
/*
* Given an inode, search for an open context with the desired characteristics
*/
-struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_context *pos, *ctx = NULL;
- spin_lock(&inode->i_lock);
- list_for_each_entry(pos, &nfsi->open_files, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &nfsi->open_files, list) {
if (cred != NULL && pos->cred != cred)
continue;
if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
continue;
ctx = get_nfs_open_context(pos);
- break;
+ if (ctx)
+ break;
}
- spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ctx;
}
@@ -1078,9 +1074,6 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx->error < 0)
invalidate_inode_pages2(inode->i_mapping);
filp->private_data = NULL;
- spin_lock(&inode->i_lock);
- list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
- spin_unlock(&inode->i_lock);
put_nfs_open_context_sync(ctx);
}
}
@@ -1329,19 +1322,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi)
{
struct inode *inode = &nfsi->vfs_inode;
- assert_spin_locked(&inode->i_lock);
-
if (!S_ISREG(inode->i_mode))
return false;
if (list_empty(&nfsi->open_files))
return false;
- /* Note: This relies on nfsi->open_files being ordered with writers
- * being placed at the head of the list.
- * See nfs_inode_attach_open_context()
- */
- return (list_first_entry(&nfsi->open_files,
- struct nfs_open_context,
- list)->mode & FMODE_WRITE) == FMODE_WRITE;
+ return inode_is_open_for_write(inode);
}
static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
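This removes the last consumer of the old "writers at the head" ordering of nfsi->open_files, which is why nfs_inode_attach_open_context() above can use a plain list_add_tail_rcu(). The VFS already counts writers per inode; inode_is_open_for_write() amounts to:

static inline bool inode_is_open_for_write(const struct inode *inode)
{
	return atomic_read(&inode->i_writecount) > 0;
}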
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8357ff69962f..b1e577302518 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -123,7 +123,7 @@ struct nfs_parsed_mount_data {
unsigned short protocol;
} nfs_server;
- struct security_mnt_opts lsm_opts;
+ void *lsm_opts;
struct net *net;
};
@@ -254,7 +254,7 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
void nfs_pgio_header_free(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
- struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
const struct rpc_call_ops *call_ops, int how, int flags);
void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
@@ -269,7 +269,7 @@ static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
- return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+ return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state;
}
/* nfs2xdr.c */
@@ -395,7 +395,6 @@ extern const struct super_operations nfs_sops;
extern struct file_system_type nfs_fs_type;
extern struct file_system_type nfs_xdev_fs_type;
#if IS_ENABLED(CONFIG_NFS_V4)
-extern struct file_system_type nfs4_xdev_fs_type;
extern struct file_system_type nfs4_referral_fs_type;
#endif
bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
@@ -565,13 +564,13 @@ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
- struct rpc_cred *cred);
+ const struct cred *cred);
extern int nfs41_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
- struct rpc_cred *cred);
-extern int nfs4_test_session_trunk(struct rpc_clnt *,
- struct rpc_xprt *,
- void *);
+ const struct cred *cred);
+extern void nfs4_test_session_trunk(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *data);
static inline struct inode *nfs_igrab_and_active(struct inode *inode)
{
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ec8a9efa268f..a3ad2d46fd42 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -607,7 +607,7 @@ out:
* readdirplus.
*/
static int
-nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred,
u64 cookie, struct page **pages, unsigned int count, bool plus)
{
struct inode *dir = d_inode(dentry);
@@ -628,7 +628,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.rpc_proc = &nfs3_procedures[NFS3PROC_READDIR],
.rpc_argp = &arg,
.rpc_resp = &res,
- .rpc_cred = cred
+ .rpc_cred = cred,
};
int status = -ENOMEM;
@@ -786,6 +786,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
struct inode *inode = hdr->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
if (hdr->pgio_done_cb != NULL)
return hdr->pgio_done_cb(task, hdr);
@@ -793,6 +794,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
+ if (task->tk_status >= 0 && !server->read_hdrsize)
+ cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+
nfs_invalidate_atime(inode);
nfs_refresh_inode(inode, &hdr->fattr);
return 0;
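The cmpxchg() turns the header-size capture into a one-shot, lock-free publication: the first successful READ reply wins, and later replies or racing callers leave the value untouched. A userspace analogue with C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int read_hdrsize;

static void record_hdrsize(unsigned int replen)
{
	unsigned int expected = 0;

	/* Succeeds only while the stored value is still 0. */
	atomic_compare_exchange_strong(&read_hdrsize, &expected, replen);
}

int main(void)
{
	record_hdrsize(26);	/* first reply: published */
	record_hdrsize(30);	/* later reply: ignored */
	printf("%u\n", atomic_load(&read_hdrsize));	/* prints 26 */
	return 0;
}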
@@ -802,6 +806,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
+ hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize;
}
static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 64e4fa33d89f..78df4eb60f85 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -983,10 +983,11 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
const void *data)
{
const struct nfs_pgio_args *args = data;
+ unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
encode_read3args(xdr, args);
prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, NFS3_readres_sz);
+ args->count, replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -1364,10 +1365,12 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
encode_nfs_fh3(xdr, args->fh);
encode_uint32(xdr, args->mask);
- if (args->mask & (NFS_ACL | NFS_DFACL))
+ if (args->mask & (NFS_ACL | NFS_DFACL)) {
prepare_reply_buffer(req, args->pages, 0,
NFSACL_MAXPAGES << PAGE_SHIFT,
ACL3_getaclres_sz);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+ }
}
static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
@@ -1673,9 +1676,11 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
void *data)
{
struct nfs_pgio_res *result = data;
+ unsigned int pos;
enum nfs_stat status;
int error;
+ pos = xdr_stream_pos(xdr);
error = decode_nfsstat3(xdr, &status);
if (unlikely(error))
goto out;
@@ -1685,6 +1690,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
result->op_status = status;
if (status != NFS3_OK)
goto out_status;
+ result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
error = decode_read3resok(xdr, result);
out:
return error;
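The captured replen counts, in 4-byte XDR words, everything that precedes the read payload: whatever the status and post-op attributes consumed since pos, plus 3 words for the fixed count, eof and opaque-length fields of READ3resok. Worked through for a typical reply (field sizes per RFC 1813):

/* status (1 word) + post-op attr present flag (1 word)
 * + fattr3 (21 words) = 23 words = 92 bytes consumed, so
 *
 *	replen = 3 + (92 >> 2) = 26 XDR words (104 bytes)
 *
 * which is the head room prepare_reply_buffer() reserves on
 * subsequent READs so the data pages line up correctly. */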
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ac5b784a1de0..fed06fd9998d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -137,31 +137,32 @@ static int handle_async_copy(struct nfs42_copy_res *res,
struct file *dst,
nfs4_stateid *src_stateid)
{
- struct nfs4_copy_state *copy;
+ struct nfs4_copy_state *copy, *tmp_copy;
int status = NFS4_OK;
bool found_pending = false;
struct nfs_open_context *ctx = nfs_file_open_context(dst);
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+ if (!copy)
+ return -ENOMEM;
+
spin_lock(&server->nfs_client->cl_lock);
- list_for_each_entry(copy, &server->nfs_client->pending_cb_stateids,
+ list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids,
copies) {
- if (memcmp(&res->write_res.stateid, &copy->stateid,
+ if (memcmp(&res->write_res.stateid, &tmp_copy->stateid,
NFS4_STATEID_SIZE))
continue;
found_pending = true;
- list_del(&copy->copies);
+ list_del(&tmp_copy->copies);
break;
}
if (found_pending) {
spin_unlock(&server->nfs_client->cl_lock);
+ kfree(copy);
+ copy = tmp_copy;
goto out;
}
- copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
- if (!copy) {
- spin_unlock(&server->nfs_client->cl_lock);
- return -ENOMEM;
- }
memcpy(&copy->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE);
init_completion(&copy->completion);
copy->parent_state = ctx->state;
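The reshuffle in handle_async_copy() follows a standard pattern: kzalloc() may sleep, so it must happen before cl_lock is taken, and the speculative allocation is simply discarded when the locked lookup finds a pending entry. Reduced to a sketch with hypothetical names:

static struct nfs4_copy_state *get_copy_state(spinlock_t *lock,
					      struct list_head *pending)
{
	struct nfs4_copy_state *new, *found = NULL;

	new = kzalloc(sizeof(*new), GFP_NOFS);	/* may sleep: no lock held */
	if (!new)
		return ERR_PTR(-ENOMEM);

	spin_lock(lock);
	if (!list_empty(pending)) {
		found = list_first_entry(pending,
					 struct nfs4_copy_state, copies);
		list_del(&found->copies);
	}
	spin_unlock(lock);

	if (found) {
		kfree(new);	/* an entry was already pending: use it */
		return found;
	}
	return new;
}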
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3a6904173214..06ac3d9ac7c6 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -41,6 +41,8 @@ enum nfs4_client_state {
NFS4CLNT_MOVED,
NFS4CLNT_LEASE_MOVED,
NFS4CLNT_DELEGATION_EXPIRED,
+ NFS4CLNT_RUN_MANAGER,
+ NFS4CLNT_DELEGRETURN_RUNNING,
};
#define NFS4_RENEW_TIMEOUT 0x01
@@ -60,10 +62,11 @@ struct nfs4_minor_version_ops {
void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
int (*test_and_free_expired)(struct nfs_server *,
- nfs4_stateid *, struct rpc_cred *);
+ nfs4_stateid *, const struct cred *);
struct nfs_seqid *
(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
- int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *);
+ void (*session_trunk)(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt, void *data);
const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -105,7 +108,7 @@ struct nfs4_state_owner {
unsigned long so_expires;
struct rb_node so_server_node;
- struct rpc_cred *so_cred; /* Associated cred */
+ const struct cred *so_cred; /* Associated cred */
spinlock_t so_lock;
atomic_t so_count;
@@ -188,9 +191,10 @@ struct nfs4_state {
unsigned int n_wronly; /* Number of write-only references */
unsigned int n_rdwr; /* Number of read/write references */
fmode_t state; /* State on the server (R,W, or RW) */
- atomic_t count;
+ refcount_t count;
wait_queue_head_t waitq;
+ struct rcu_head rcu_head;
};
@@ -209,10 +213,10 @@ struct nfs4_state_recovery_ops {
int state_flag_bit;
int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
int (*recover_lock)(struct nfs4_state *, struct file_lock *);
- int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
- int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
+ int (*establish_clid)(struct nfs_client *, const struct cred *);
+ int (*reclaim_complete)(struct nfs_client *, const struct cred *);
int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
- struct rpc_cred *);
+ const struct cred *);
};
struct nfs4_opendata {
@@ -242,19 +246,19 @@ struct nfs4_opendata {
struct nfs4_add_xprt_data {
struct nfs_client *clp;
- struct rpc_cred *cred;
+ const struct cred *cred;
};
struct nfs4_state_maintenance_ops {
- int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
- struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
- int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
+ int (*sched_state_renewal)(struct nfs_client *, const struct cred *, unsigned);
+ const struct cred * (*get_state_renewal_cred)(struct nfs_client *);
+ int (*renew_lease)(struct nfs_client *, const struct cred *);
};
struct nfs4_mig_recovery_ops {
int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
- struct page *, struct rpc_cred *);
- int (*fsid_present)(struct inode *, struct rpc_cred *);
+ struct page *, const struct cred *);
+ int (*fsid_present)(struct inode *, const struct cred *);
};
extern const struct dentry_operations nfs4_dentry_operations;
@@ -283,21 +287,21 @@ extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
struct rpc_message *, struct nfs4_sequence_args *,
struct nfs4_sequence_res *, int);
extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int);
-extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *);
extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
-extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
-extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred);
extern int nfs4_destroy_clientid(struct nfs_client *clp);
-extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_init_clientid(struct nfs_client *, const struct cred *);
+extern int nfs41_init_clientid(struct nfs_client *, const struct cred *);
extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
struct nfs4_fs_locations *, struct page *);
extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
- struct page *page, struct rpc_cred *);
-extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
+ struct page *page, const struct cred *);
+extern int nfs4_proc_fsid_present(struct inode *, const struct cred *);
extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, const struct qstr *,
struct nfs_fh *, struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
@@ -309,8 +313,8 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
#if defined(CONFIG_NFS_V4_1)
extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
-extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
+extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, const struct cred *);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
@@ -335,7 +339,6 @@ static inline bool
_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
struct rpc_clnt **clntp, struct rpc_message *msg)
{
- struct rpc_cred *newcred = NULL;
rpc_authflavor_t flavor;
if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP ||
@@ -350,13 +353,7 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
return false;
}
if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
- spin_lock(&clp->cl_lock);
- if (clp->cl_machine_cred != NULL)
- /* don't call get_rpccred on the machine cred -
- * a reference will be held for life of clp */
- newcred = clp->cl_machine_cred;
- spin_unlock(&clp->cl_lock);
- msg->rpc_cred = newcred;
+ msg->rpc_cred = rpc_machine_cred();
flavor = clp->cl_rpcclient->cl_auth->au_flavor;
WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
@@ -447,16 +444,16 @@ extern void nfs4_set_lease_period(struct nfs_client *clp,
/* nfs4state.c */
-struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp);
int nfs4_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **);
int nfs40_discover_server_trunking(struct nfs_client *clp,
- struct nfs_client **, struct rpc_cred *);
+ struct nfs_client **, const struct cred *);
#if defined(CONFIG_NFS_V4_1)
int nfs41_discover_server_trunking(struct nfs_client *clp,
- struct nfs_client **, struct rpc_cred *);
+ struct nfs_client **, const struct cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
extern void nfs41_notify_server(struct nfs_client *);
#else
@@ -465,7 +462,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session,
}
#endif /* CONFIG_NFS_V4_1 */
-extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
+extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
extern void nfs4_purge_state_owners(struct nfs_server *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
@@ -491,7 +488,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
const struct nfs_lock_context *, nfs4_stateid *,
- struct rpc_cred **);
+ const struct cred **);
extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
struct nfs4_state *state);
extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 146e30862234..2548405da1f7 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -545,7 +545,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new,
*/
int nfs40_walk_client_list(struct nfs_client *new,
struct nfs_client **result,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
struct nfs_client *pos, *prev = NULL;
@@ -711,7 +711,7 @@ out_err:
*/
int nfs41_walk_client_list(struct nfs_client *new,
struct nfs_client **result,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
struct nfs_client *pos, *prev = NULL;
@@ -950,10 +950,10 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
/*
* Session has been established, and the client marked ready.
- * Set the mount rsize and wsize with negotiated fore channel
- * attributes which will be bound checked in nfs_server_set_fsinfo.
+ * Limit the mount rsize, wsize and dtsize using negotiated fore
+ * channel attributes.
*/
-static void nfs4_session_set_rwsize(struct nfs_server *server)
+static void nfs4_session_limit_rwsize(struct nfs_server *server)
{
#ifdef CONFIG_NFS_V4_1
struct nfs4_session *sess;
@@ -966,9 +966,11 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
- if (!server->rsize || server->rsize > server_resp_sz)
+ if (server->dtsize > server_resp_sz)
+ server->dtsize = server_resp_sz;
+ if (server->rsize > server_resp_sz)
server->rsize = server_resp_sz;
- if (!server->wsize || server->wsize > server_rqst_sz)
+ if (server->wsize > server_rqst_sz)
server->wsize = server_rqst_sz;
#endif /* CONFIG_NFS_V4_1 */
}
@@ -1015,12 +1017,12 @@ static int nfs4_server_common_setup(struct nfs_server *server,
(unsigned long long) server->fsid.minor);
nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
- nfs4_session_set_rwsize(server);
-
error = nfs_probe_fsinfo(server, mntfh, fattr);
if (error < 0)
goto out;
+ nfs4_session_limit_rwsize(server);
+
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
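Moving the clamp after nfs_probe_fsinfo() matters: the probe first fills in the server's preferred rsize/wsize/dtsize, and only then are all three bounded by the negotiated fore-channel sizes, where the old helper skipped any value the mount options had already set. Illustrative numbers, assuming a 1 MB max_resp_sz:

/* server_resp_sz = 1048576 - nfs41_maxread_overhead
 *
 *	rsize requested as 4 MB  -> clamped down to server_resp_sz
 *	rsize requested as 64 KB -> left alone (below the limit)
 *	dtsize (readdir)         -> now clamped by the same bound
 */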
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4288a6ecaf75..45b2322e092d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
- ssize_t ret;
-
if (file_inode(file_in) == file_inode(file_out))
return -EINVAL;
-retry:
- ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
- if (ret == -EAGAIN)
- goto retry;
- return ret;
+ return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
@@ -180,8 +174,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_allocate(filep, offset, len);
}
-static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
- struct file *dst_file, loff_t dst_off, u64 count)
+static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, loff_t count,
+ unsigned int remap_flags)
{
struct inode *dst_inode = file_inode(dst_file);
struct nfs_server *server = NFS_SERVER(dst_inode);
@@ -190,6 +185,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
bool same_inode = false;
int ret;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
/* check alignment w.r.t. clone_blksize */
ret = -EINVAL;
if (bs) {
@@ -240,7 +238,7 @@ out_unlock:
inode_unlock(src_inode);
}
out:
- return ret;
+ return ret < 0 ? ret : count;
}
#endif /* CONFIG_NFS_V4_2 */
@@ -262,7 +260,7 @@ const struct file_operations nfs4_file_operations = {
.copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek,
.fallocate = nfs42_fallocate,
- .clone_file_range = nfs42_clone_file_range,
+ .remap_file_range = nfs42_remap_file_range,
#else
.llseek = nfs_file_llseek,
#endif
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3f23b6840547..bf34ddaa2ad7 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -44,6 +44,7 @@
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
#include <linux/module.h>
#include "internal.h"
@@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy;
struct idmap_legacy_upcalldata {
struct rpc_pipe_msg pipe_msg;
struct idmap_msg idmap_msg;
- struct key_construction *key_cons;
+ struct key *authkey;
struct idmap *idmap;
};
@@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = {
{ Opt_find_err, NULL }
};
-static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static int nfs_idmap_legacy_upcall(struct key *, void *);
static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
size_t);
static void idmap_release_pipe(struct inode *);
@@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
static void
nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
{
- struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+ struct key *authkey = idmap->idmap_upcall_data->authkey;
kfree(idmap->idmap_upcall_data);
idmap->idmap_upcall_data = NULL;
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
+ key_put(authkey);
}
static void
@@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
}
-static int nfs_idmap_legacy_upcall(struct key_construction *cons,
- const char *op,
- void *aux)
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
{
struct idmap_legacy_upcalldata *data;
+ struct request_key_auth *rka = get_request_key_auth(authkey);
struct rpc_pipe_msg *msg;
struct idmap_msg *im;
struct idmap *idmap = (struct idmap *)aux;
- struct key *key = cons->key;
+ struct key *key = rka->target_key;
int ret = -ENOKEY;
if (!aux)
@@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
msg = &data->pipe_msg;
im = &data->idmap_msg;
data->idmap = idmap;
- data->key_cons = cons;
+ data->authkey = key_get(authkey);
ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
if (ret < 0)
@@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
out2:
kfree(data);
out1:
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
return ret;
}
@@ -651,9 +652,10 @@ out:
static ssize_t
idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
+ struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
- struct key_construction *cons;
+ struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
int ret = -ENOKEY;
@@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (idmap->idmap_upcall_data == NULL)
goto out_noupcall;
- cons = idmap->idmap_upcall_data->key_cons;
+ authkey = idmap->idmap_upcall_data->authkey;
+ rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
ret = -ENOSPC;
@@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
ret = nfs_idmap_read_and_verify_message(&im,
&idmap->idmap_upcall_data->idmap_msg,
- cons->key, cons->authkey);
+ rka->target_key, authkey);
if (ret >= 0) {
- key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+ key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8220a168282e..557a5d636183 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -93,19 +93,19 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label, struct inode *inode);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode);
-static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
- struct rpc_cred *cred,
+ const struct cred *cred,
struct nfs4_slot *slot,
bool is_privileged);
static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
- struct rpc_cred *);
+ const struct cred *);
static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
- struct rpc_cred *, bool);
+ const struct cred *, bool);
#endif
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -361,7 +361,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
@@ -370,7 +370,7 @@ static void nfs4_test_and_free_stateid(struct nfs_server *server,
static void __nfs4_free_revoked_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
stateid->type = NFS4_REVOKED_STATEID_TYPE;
nfs4_test_and_free_stateid(server, stateid, cred);
@@ -378,7 +378,7 @@ static void __nfs4_free_revoked_stateid(struct nfs_server *server,
static void nfs4_free_revoked_stateid(struct nfs_server *server,
const nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
nfs4_stateid tmp;
@@ -908,7 +908,7 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
static void
nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, struct rpc_cred *cred)
+ struct nfs4_slot *slot, const struct cred *cred)
{
struct rpc_task *task;
@@ -939,7 +939,7 @@ EXPORT_SYMBOL_GPL(nfs4_sequence_done);
static void
nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, struct rpc_cred *cred)
+ struct nfs4_slot *slot, const struct cred *cred)
{
WARN_ON_ONCE(1);
slot->interrupted = 0;
@@ -1349,12 +1349,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
return false;
}
-static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode,
+ int open_mode, enum open_claim_type4 claim)
{
int ret = 0;
if (open_mode & (O_EXCL|O_TRUNC))
goto out;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ goto out;
+ default:
+ break;
+ }
switch (mode & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ:
ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
@@ -1747,7 +1755,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
for (;;) {
spin_lock(&state->owner->so_lock);
- if (can_open_cached(state, fmode, open_mode)) {
+ if (can_open_cached(state, fmode, open_mode, claim)) {
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
goto out_return_state;
@@ -1777,7 +1785,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
out:
return ERR_PTR(ret);
out_return_state:
- atomic_inc(&state->count);
+ refcount_inc(&state->count);
return state;
}
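
[note] The atomic_inc() -> refcount_inc() conversions on state->count here and in the hunks below (paired with refcount_set()/refcount_dec_and_lock() on struct nfs4_state in fs/nfs/nfs4state.c further down) move the open-state counter onto the hardened refcount API, which saturates instead of wrapping on overflow and WARNs on misuse. A minimal sketch of the pattern, with an invented demo_state standing in for the real structure:

    /* Sketch only: refcount_set()/refcount_inc()/refcount_dec_and_test()
     * are the real <linux/refcount.h> API; demo_state is illustrative. */
    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct demo_state {
            refcount_t count;
    };

    static struct demo_state *demo_alloc(void)
    {
            struct demo_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

            if (s)
                    refcount_set(&s->count, 1);     /* initial reference */
            return s;
    }

    static void demo_put(struct demo_state *s)
    {
            /* WARNs and saturates on misuse instead of silently wrapping */
            if (refcount_dec_and_test(&s->count))
                    kfree(s);
    }
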
@@ -1849,7 +1857,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
update:
update_open_stateid(state, &data->o_res.stateid, NULL,
data->o_arg.fmode);
- atomic_inc(&state->count);
+ refcount_inc(&state->count);
return state;
}
@@ -1887,7 +1895,7 @@ nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
return ERR_CAST(inode);
if (data->state != NULL && data->state->inode == inode) {
state = data->state;
- atomic_inc(&state->count);
+ refcount_inc(&state->count);
} else
state = nfs4_get_open_state(inode, data->owner);
iput(inode);
@@ -1933,23 +1941,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
return ret;
}
-static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
+static struct nfs_open_context *
+nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode)
{
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_open_context *ctx;
- spin_lock(&state->inode->i_lock);
- list_for_each_entry(ctx, &nfsi->open_files, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
if (ctx->state != state)
continue;
- get_nfs_open_context(ctx);
- spin_unlock(&state->inode->i_lock);
+ if ((ctx->mode & mode) != mode)
+ continue;
+ if (!get_nfs_open_context(ctx))
+ continue;
+ rcu_read_unlock();
return ctx;
}
- spin_unlock(&state->inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
+static struct nfs_open_context *
+nfs4_state_find_open_context(struct nfs4_state *state)
+{
+ struct nfs_open_context *ctx;
+
+ ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE);
+ if (!IS_ERR(ctx))
+ return ctx;
+ ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE);
+ if (!IS_ERR(ctx))
+ return ctx;
+ return nfs4_state_find_open_context_mode(state, FMODE_READ);
+}
+
static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
struct nfs4_state *state, enum open_claim_type4 claim)
{
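
[note] Two things change in the open-context lookup rewritten above: the per-inode open_files list is now walked under rcu_read_lock() rather than inode->i_lock, and get_nfs_open_context() is treated as fallible, so an entry whose last reference is already on its way out is simply skipped. The new nfs4_state_find_open_context() then prefers a read/write context over write-only over read-only. The generic lookup-and-get shape, as a sketch with invented names:

    /* Sketch: RCU list walk plus conditional reference; demo_ctx and
     * demo_find_ctx are invented, list_for_each_entry_rcu() and
     * refcount_inc_not_zero() are the real primitives. */
    #include <linux/rculist.h>
    #include <linux/refcount.h>
    #include <linux/types.h>

    struct demo_ctx {
            struct list_head list;
            refcount_t count;
            fmode_t mode;
    };

    static struct demo_ctx *demo_find_ctx(struct list_head *head, fmode_t mode)
    {
            struct demo_ctx *ctx;

            rcu_read_lock();
            list_for_each_entry_rcu(ctx, head, list) {
                    if ((ctx->mode & mode) != mode)
                            continue;
                    if (!refcount_inc_not_zero(&ctx->count))
                            continue;       /* being freed; skip it */
                    rcu_read_unlock();
                    return ctx;
            }
            rcu_read_unlock();
            return NULL;
    }
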
@@ -1960,7 +1986,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
- atomic_inc(&state->count);
+ refcount_inc(&state->count);
return opendata;
}
@@ -2276,7 +2302,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
if (data->state != NULL) {
struct nfs_delegation *delegation;
- if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
+ if (can_open_cached(data->state, data->o_arg.fmode,
+ data->o_arg.open_flags, claim))
goto out_no_action;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
@@ -2457,7 +2484,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
* Note that in the non-execute case, we want to turn off permission
* checking if we just created a new file (POSIX open() semantics).
*/
-static int nfs4_opendata_access(struct rpc_cred *cred,
+static int nfs4_opendata_access(const struct cred *cred,
struct nfs4_opendata *opendata,
struct nfs4_state *state, fmode_t fmode,
int openflags)
@@ -2624,7 +2651,7 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
return -NFS4ERR_BAD_STATEID;
}
@@ -2632,7 +2659,7 @@ static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
#if defined(CONFIG_NFS_V4_1)
static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
int status;
@@ -2666,7 +2693,7 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid stateid;
struct nfs_delegation *delegation;
- struct rpc_cred *cred;
+ const struct cred *cred = NULL;
int status;
/* Get the delegation credential for use by test/free_stateid */
@@ -2691,14 +2718,16 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
return;
}
- cred = get_rpccred(delegation->cred);
+ if (delegation->cred)
+ cred = get_cred(delegation->cred);
rcu_read_unlock();
status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
nfs_finish_clear_delegation_stateid(state, &stateid);
- put_rpccred(cred);
+ if (delegation->cred)
+ put_cred(cred);
}
/**
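
[note] Elsewhere in this patch, "if (cred) put_rpccred(cred);" collapses to a bare put_cred(cred): the generic cred helpers, as updated in this same series, accept NULL, so the explicit checks become unnecessary. The delegation path above keeps its "if (delegation->cred)" tests because cred starts out NULL and is only pinned when the delegation actually carries one. The basic pairing, sketched with an invented wrapper:

    /* get_cred()/put_cred() are the real <linux/cred.h> API; demo_with_cred
     * is illustrative. Assumption (from this patch series): both helpers
     * tolerate a NULL argument. */
    #include <linux/cred.h>

    static void demo_with_cred(const struct cred *src)
    {
            const struct cred *cred = NULL;

            if (src)
                    cred = get_cred(src);   /* take a reference */

            /* ... issue the operation; cred may legitimately be NULL ... */

            put_cred(cred);                 /* NULL-safe drop */
    }
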
@@ -2721,7 +2750,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
spin_lock(&state->state_lock);
list_for_each_entry(lsp, &state->lock_states, ls_locks) {
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+ const struct cred *cred = lsp->ls_state->owner->so_cred;
refcount_inc(&lsp->ls_count);
spin_unlock(&state->state_lock);
@@ -2765,7 +2794,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->open_stateid;
- struct rpc_cred *cred = state->owner->so_cred;
+ const struct cred *cred = state->owner->so_cred;
int status;
if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
@@ -2923,7 +2952,7 @@ static int _nfs4_do_open(struct inode *dir,
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_opendata *opendata;
struct dentry *dentry = ctx->dentry;
- struct rpc_cred *cred = ctx->cred;
+ const struct cred *cred = ctx->cred;
struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
@@ -3093,7 +3122,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
static int _nfs4_do_setattr(struct inode *inode,
struct nfs_setattrargs *arg,
struct nfs_setattrres *res,
- struct rpc_cred *cred,
+ const struct cred *cred,
struct nfs_open_context *ctx)
{
struct nfs_server *server = NFS_SERVER(inode);
@@ -3103,7 +3132,7 @@ static int _nfs4_do_setattr(struct inode *inode,
.rpc_resp = res,
.rpc_cred = cred,
};
- struct rpc_cred *delegation_cred = NULL;
+ const struct cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
bool truncate;
int status;
@@ -3138,14 +3167,14 @@ zero_stateid:
status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
- put_rpccred(delegation_cred);
+ put_cred(delegation_cred);
if (status == 0 && ctx != NULL)
renew_lease(server, timestamp);
trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
}
-static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel)
@@ -3761,7 +3790,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
}
/*
- * -EACCESS could mean that the user doesn't have correct permissions
+ * -EACCES could mean that the user doesn't have correct permissions
* to access the mount. It could also mean that we tried to mount
* with a gss auth flavor, but rpc.gssd isn't running. Either way,
* existing mount programs don't handle -EACCES very well so it should
@@ -3946,7 +3975,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
struct inode *inode = d_inode(dentry);
- struct rpc_cred *cred = NULL;
+ const struct cred *cred = NULL;
struct nfs_open_context *ctx = NULL;
struct nfs4_label *label = NULL;
int status;
@@ -4175,7 +4204,6 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
return -ENOMEM;
args.bitmask = server->cache_consistency_bitmask;
}
-
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (!status) {
nfs_access_set_mask(entry, res.access);
@@ -4664,7 +4692,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
return err;
}
-static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
u64 cookie, struct page **pages, unsigned int count, bool plus)
{
struct inode *dir = d_inode(dentry);
@@ -4702,7 +4730,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
return status;
}
-static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
u64 cookie, struct page **pages, unsigned int count, bool plus)
{
struct nfs4_exception exception = { };
@@ -5230,7 +5258,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
.rpc_release = nfs4_renew_release,
};
-static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
+static int nfs4_proc_async_renew(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5254,7 +5282,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred,
&nfs4_renew_ops, data);
}
-static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5669,7 +5697,6 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
{
struct nfs4_label ilabel, *olabel = NULL;
struct nfs_fattr fattr;
- struct rpc_cred *cred;
int status;
if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -5682,10 +5709,6 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
ilabel.label = (char *)buf;
ilabel.len = buflen;
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return PTR_ERR(cred);
-
olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
if (IS_ERR(olabel)) {
status = -PTR_ERR(olabel);
@@ -5698,7 +5721,6 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
nfs4_label_free(olabel);
out:
- put_rpccred(cred);
return status;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
@@ -5867,13 +5889,13 @@ static const struct rpc_call_ops nfs4_setclientid_ops = {
* @clp: state data structure
* @program: RPC program for NFSv4 callback service
* @port: IP port number for NFS4 callback service
- * @cred: RPC credential to use for this call
+ * @cred: credential to use for this call
* @res: where to place the result
*
* Returns zero, a negative errno, or a negative NFS4ERR status code.
*/
int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
- unsigned short port, struct rpc_cred *cred,
+ unsigned short port, const struct cred *cred,
struct nfs4_setclientid_res *res)
{
nfs4_verifier sc_verifier;
@@ -5942,13 +5964,13 @@ out:
* nfs4_proc_setclientid_confirm - Confirm client ID
* @clp: state data structure
* @res: result of a previous SETCLIENTID
- * @cred: RPC credential to use for this call
+ * @cred: credential to use for this call
*
* Returns zero, a negative errno, or a negative NFS4ERR status code.
*/
int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
struct nfs4_setclientid_res *arg,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
@@ -6111,7 +6133,7 @@ static const struct rpc_call_ops nfs4_delegreturn_ops = {
.rpc_release = nfs4_delegreturn_release,
};
-static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
+static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
{
struct nfs4_delegreturndata *data;
struct nfs_server *server = NFS_SERVER(inode);
@@ -6178,7 +6200,7 @@ out:
return status;
}
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = { };
@@ -6284,7 +6306,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
p->l_ctx = nfs_get_lock_context(ctx);
- memcpy(&p->fl, fl, sizeof(p->fl));
+ locks_init_lock(&p->fl);
+ locks_copy_lock(&p->fl, fl);
p->server = NFS_SERVER(inode);
return p;
}
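
[note] Swapping memcpy() for locks_init_lock() + locks_copy_lock() in the two lock-data allocators (here and in nfs4_alloc_lockdata below) is a correctness fix: struct file_lock embeds list heads and ops pointers, so a raw memcpy() duplicates live list linkage, whereas locks_init_lock() gives the copy fresh list heads and locks_copy_lock() copies the lock fields and invokes the owner's fl_copy_lock hook. The intended idiom, sketched:

    /* locks_init_lock()/locks_copy_lock() are the real fs/locks.c API;
     * demo_dup_lock is illustrative. */
    #include <linux/fs.h>

    static void demo_dup_lock(struct file_lock *dst, struct file_lock *src)
    {
            locks_init_lock(dst);           /* fresh list heads, no stale state */
            locks_copy_lock(dst, src);      /* field copy + fl_copy_lock callback */
    }
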
@@ -6506,7 +6529,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->server = server;
refcount_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
- memcpy(&p->fl, fl, sizeof(p->fl));
+ locks_init_lock(&p->fl);
+ locks_copy_lock(&p->fl, fl);
return p;
out_free_seqid:
nfs_free_seqid(p->arg.open_seqid);
@@ -7239,7 +7263,7 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
*/
static int _nfs40_proc_get_locations(struct inode *inode,
struct nfs4_fs_locations *locations,
- struct page *page, struct rpc_cred *cred)
+ struct page *page, const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct rpc_clnt *clnt = server->client;
@@ -7296,7 +7320,7 @@ static int _nfs40_proc_get_locations(struct inode *inode,
*/
static int _nfs41_proc_get_locations(struct inode *inode,
struct nfs4_fs_locations *locations,
- struct page *page, struct rpc_cred *cred)
+ struct page *page, const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct rpc_clnt *clnt = server->client;
@@ -7355,7 +7379,7 @@ static int _nfs41_proc_get_locations(struct inode *inode,
*/
int nfs4_proc_get_locations(struct inode *inode,
struct nfs4_fs_locations *locations,
- struct page *page, struct rpc_cred *cred)
+ struct page *page, const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_client *clp = server->nfs_client;
@@ -7386,7 +7410,7 @@ int nfs4_proc_get_locations(struct inode *inode,
* is appended to this compound to identify the client ID which is
* performing recovery.
*/
-static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+static int _nfs40_proc_fsid_present(struct inode *inode, const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
@@ -7432,7 +7456,7 @@ static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
* this operation is identified in the SEQUENCE operation in this
* compound.
*/
-static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+static int _nfs41_proc_fsid_present(struct inode *inode, const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct rpc_clnt *clnt = server->client;
@@ -7479,7 +7503,7 @@ static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
* NFS4ERR code if some error occurred on the server, or a
* negative errno if a local failure occurred.
*/
-int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_client *clp = server->nfs_client;
@@ -7526,7 +7550,7 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
.rpc_resp = &res,
};
struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
- struct rpc_cred *cred = NULL;
+ const struct cred *cred = NULL;
if (use_integrity) {
clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
@@ -7543,8 +7567,7 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
&res.seq_res, 0);
dprintk("NFS reply secinfo: %d\n", status);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
return status;
}
@@ -7625,7 +7648,7 @@ static
int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
struct rpc_xprt *xprt,
struct nfs_client *clp,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
int status;
struct nfs41_bind_conn_to_session_args args = {
@@ -7687,7 +7710,7 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
struct rpc_bind_conn_calldata {
struct nfs_client *clp;
- struct rpc_cred *cred;
+ const struct cred *cred;
};
static int
@@ -7700,7 +7723,7 @@ nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
}
-int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, const struct cred *cred)
{
struct rpc_bind_conn_calldata data = {
.clp = clp,
@@ -7866,7 +7889,7 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
* Wrapper for EXCHANGE_ID operation.
*/
static struct rpc_task *
-nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
u32 sp4_how, struct rpc_xprt *xprt)
{
struct rpc_message msg = {
@@ -7962,7 +7985,7 @@ out:
*
* Wrapper for EXCHANGE_ID operation.
*/
-static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred,
u32 sp4_how)
{
struct rpc_task *task;
@@ -8029,7 +8052,7 @@ out:
*
* Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
*/
-int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred)
{
rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
int status;
@@ -8061,7 +8084,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
* @xprt: the rpc_xprt to test
* @data: call data for _nfs4_proc_exchange_id.
*/
-int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
void *data)
{
struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
@@ -8078,20 +8101,22 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
/* Test connection for session trunking. Async exchange_id call */
task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
if (IS_ERR(task))
- return PTR_ERR(task);
+ return;
status = task->tk_status;
if (status == 0)
status = nfs4_detect_session_trunking(adata->clp,
task->tk_msg.rpc_resp, xprt);
+ if (status == 0)
+ rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
+
rpc_put_task(task);
- return status;
}
EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
@@ -8109,7 +8134,7 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
}
static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
unsigned int loop;
int ret;
@@ -8130,7 +8155,7 @@ static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
int nfs4_destroy_clientid(struct nfs_client *clp)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
int ret = 0;
if (clp->cl_mvops->minor_version < 1)
@@ -8141,8 +8166,7 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
goto out;
cred = nfs4_get_clid_cred(clp);
ret = nfs4_proc_destroy_clientid(clp, cred);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
switch (ret) {
case 0:
case -NFS4ERR_STALE_CLIENTID:
@@ -8358,7 +8382,7 @@ static void nfs4_update_session(struct nfs4_session *session,
}
static int _nfs4_proc_create_session(struct nfs_client *clp,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs4_session *session = clp->cl_session;
struct nfs41_create_session_args args = {
@@ -8410,7 +8434,7 @@ out:
* It is the responsibility of the caller to verify the session is
* expired before calling this routine.
*/
-int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
{
int status;
unsigned *ptr;
@@ -8441,7 +8465,7 @@ out:
* The caller must serialize access to this routine.
*/
int nfs4_proc_destroy_session(struct nfs4_session *session,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
@@ -8543,7 +8567,7 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
};
static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
- struct rpc_cred *cred,
+ const struct cred *cred,
struct nfs4_slot *slot,
bool is_privileged)
{
@@ -8586,7 +8610,7 @@ out_err:
return ret;
}
-static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
+static int nfs41_proc_async_sequence(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
{
struct rpc_task *task;
int ret = 0;
@@ -8602,7 +8626,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
return ret;
}
-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+static int nfs4_proc_sequence(struct nfs_client *clp, const struct cred *cred)
{
struct rpc_task *task;
int ret;
@@ -8698,7 +8722,7 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
* Issue a global reclaim complete.
*/
static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs4_reclaim_complete_data *calldata;
struct rpc_task *task;
@@ -9051,7 +9075,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *pdev,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs4_getdeviceinfo_args args = {
.pdev = pdev,
@@ -9083,7 +9107,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *pdev,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs4_exception exception = { };
int err;
@@ -9140,7 +9164,7 @@ static void nfs4_layoutcommit_release(void *calldata)
pnfs_cleanup_layoutcommit(data);
nfs_post_op_update_inode_force_wcc(data->args.inode,
data->res.fattr);
- put_rpccred(data->cred);
+ put_cred(data->cred);
nfs_iput_and_deactive(data->inode);
kfree(data);
}
@@ -9216,7 +9240,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
struct rpc_clnt *clnt = server->client;
- struct rpc_cred *cred = NULL;
+ const struct cred *cred = NULL;
int status;
if (use_integrity) {
@@ -9230,8 +9254,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
&res.seq_res, 0);
dprintk("<-- %s status=%d\n", __func__, status);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
return status;
}
@@ -9344,7 +9367,7 @@ out:
static int _nfs41_test_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
int status;
struct nfs41_test_stateid_args args = {
@@ -9405,7 +9428,7 @@ static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
*/
static int nfs41_test_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs4_exception exception = { };
int err;
@@ -9467,7 +9490,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
*/
static int nfs41_free_stateid(struct nfs_server *server,
const nfs4_stateid *stateid,
- struct rpc_cred *cred,
+ const struct cred *cred,
bool privileged)
{
struct rpc_message msg = {
@@ -9508,7 +9531,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
static void
nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+ const struct cred *cred = lsp->ls_state->owner->so_cred;
nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
@@ -9579,14 +9602,14 @@ static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
.sched_state_renewal = nfs4_proc_async_renew,
- .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
+ .get_state_renewal_cred = nfs4_get_renew_cred,
.renew_lease = nfs4_proc_renew,
};
#if defined(CONFIG_NFS_V4_1)
static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
.sched_state_renewal = nfs41_proc_async_sequence,
- .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
+ .get_state_renewal_cred = nfs4_get_machine_cred,
.renew_lease = nfs4_proc_sequence,
};
#endif
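
[note] Most of the nfs4proc.c churn above is one mechanical substitution: struct rpc_cred * parameters and locals become const struct cred *, get_rpccred()/put_rpccred() become get_cred()/put_cred(), and the rpc_lookup_cred() call in nfs4_set_security_label() disappears because a separately looked-up credential is no longer threaded through that path. A hedged before/after sketch of the shape of a converted helper (demo_renew is invented; the types and helpers are the real generic-cred API):

    #include <linux/cred.h>

    /* old shape: static int demo_renew(struct rpc_cred *cred); */
    static int demo_renew(const struct cred *cred)
    {
            const struct cred *c = get_cred(cred); /* was get_rpccred() */

            /* ... plug c into rpc_message.rpc_cred and fire the RPC ... */

            put_cred(c);                           /* was put_rpccred() */
            return 0;
    }
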
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1f8c2ae43a8d..6ea431b067dd 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -57,7 +57,7 @@ nfs4_renew_state(struct work_struct *work)
const struct nfs4_state_maintenance_ops *ops;
struct nfs_client *clp =
container_of(work, struct nfs_client, cl_renewd.work);
- struct rpc_cred *cred;
+ const struct cred *cred;
long lease;
unsigned long last, now;
unsigned renew_flags = 0;
@@ -68,7 +68,6 @@ nfs4_renew_state(struct work_struct *work)
if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
goto out;
- spin_lock(&clp->cl_lock);
lease = clp->cl_lease_time;
last = clp->cl_last_renewal;
now = jiffies;
@@ -79,8 +78,7 @@ nfs4_renew_state(struct work_struct *work)
renew_flags |= NFS4_RENEW_DELEGATION_CB;
if (renew_flags != 0) {
- cred = ops->get_state_renewal_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
+ cred = ops->get_state_renewal_cred(clp);
if (cred == NULL) {
if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
@@ -92,7 +90,7 @@ nfs4_renew_state(struct work_struct *work)
/* Queue an asynchronous RENEW. */
ret = ops->sched_state_renewal(clp, cred, renew_flags);
- put_rpccred(cred);
+ put_cred(cred);
switch (ret) {
default:
goto out_exp;
@@ -104,7 +102,6 @@ nfs4_renew_state(struct work_struct *work)
} else {
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
__func__);
- spin_unlock(&clp->cl_lock);
}
nfs4_schedule_state_renewal(clp);
out_exp:
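
[note] nfs4_renew_state() stops taking clp->cl_lock because the renamed ->get_state_renewal_cred() callback no longer needs the caller to hold it: machine-cred lookup is lock-free now, and nfs4_get_renew_cred() (see fs/nfs/nfs4state.c below) takes cl_lock internally only for its state-owner fallback; the unlocked reads of cl_lease_time/cl_last_renewal merely decide whether to schedule an asynchronous RENEW. (The kerneldoc line "Caller must hold clp->cl_lock" left on nfs4_get_renew_cred below is stale after this change.) Sketch of the refactor, pushing a lock from every caller into the one helper that needs it (all names invented):

    #include <linux/spinlock.h>
    #include <linux/cred.h>

    static DEFINE_SPINLOCK(demo_lock);
    static const struct cred *demo_fallback_cred; /* protected by demo_lock */

    static const struct cred *demo_get_renewal_cred(void)
    {
            const struct cred *cred;

            spin_lock(&demo_lock);          /* was every caller's job */
            cred = demo_fallback_cred ? get_cred(demo_fallback_cred) : NULL;
            spin_unlock(&demo_lock);
            return cred;
    }
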
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 769b85655c4b..a5489d70a724 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -573,12 +573,11 @@ static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
void nfs4_destroy_session(struct nfs4_session *session)
{
struct rpc_xprt *xprt;
- struct rpc_cred *cred;
+ const struct cred *cred;
cred = nfs4_get_clid_cred(session->clp);
nfs4_proc_destroy_session(session, cred);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
rcu_read_lock();
xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 40a08cd483f0..02488b50534a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -87,7 +87,7 @@ const nfs4_stateid current_stateid = {
static DEFINE_MUTEX(nfs_clid_init_mutex);
-int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_init_clientid(struct nfs_client *clp, const struct cred *cred)
{
struct nfs4_setclientid_res clid = {
.clientid = clp->cl_clientid,
@@ -134,7 +134,7 @@ out:
*/
int nfs40_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **result,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs4_setclientid_res clid = {
.clientid = clp->cl_clientid,
@@ -164,32 +164,23 @@ out:
return status;
}
-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp)
{
- struct rpc_cred *cred = NULL;
-
- if (clp->cl_machine_cred != NULL)
- cred = get_rpccred(clp->cl_machine_cred);
- return cred;
+ return get_cred(rpc_machine_cred());
}
static void nfs4_root_machine_cred(struct nfs_client *clp)
{
- struct rpc_cred *cred, *new;
- new = rpc_lookup_machine_cred(NULL);
- spin_lock(&clp->cl_lock);
- cred = clp->cl_machine_cred;
- clp->cl_machine_cred = new;
- spin_unlock(&clp->cl_lock);
- if (cred != NULL)
- put_rpccred(cred);
+ /* Force root creds instead of machine */
+ clp->cl_principal = NULL;
+ clp->cl_rpcclient->cl_principal = NULL;
}
-static struct rpc_cred *
+static const struct cred *
nfs4_get_renew_cred_server_locked(struct nfs_server *server)
{
- struct rpc_cred *cred = NULL;
+ const struct cred *cred = NULL;
struct nfs4_state_owner *sp;
struct rb_node *pos;
@@ -199,29 +190,30 @@ nfs4_get_renew_cred_server_locked(struct nfs_server *server)
sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
if (list_empty(&sp->so_states))
continue;
- cred = get_rpccred(sp->so_cred);
+ cred = get_cred(sp->so_cred);
break;
}
return cred;
}
/**
- * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
+ * nfs4_get_renew_cred - Acquire credential for a renew operation
* @clp: client state handle
*
* Returns an rpc_cred with reference count bumped, or NULL.
* Caller must hold clp->cl_lock.
*/
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp)
{
- struct rpc_cred *cred = NULL;
+ const struct cred *cred = NULL;
struct nfs_server *server;
/* Use machine credentials if available */
- cred = nfs4_get_machine_cred_locked(clp);
+ cred = nfs4_get_machine_cred(clp);
if (cred != NULL)
goto out;
+ spin_lock(&clp->cl_lock);
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
cred = nfs4_get_renew_cred_server_locked(server);
@@ -229,6 +221,7 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
break;
}
rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
out:
return cred;
@@ -319,7 +312,7 @@ static void nfs41_finish_session_reset(struct nfs_client *clp)
nfs41_setup_state_renewal(clp);
}
-int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs41_init_clientid(struct nfs_client *clp, const struct cred *cred)
{
int status;
@@ -354,7 +347,7 @@ out:
*/
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **result,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
int status;
@@ -392,32 +385,32 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
* nfs4_get_clid_cred - Acquire credential for a setclientid operation
* @clp: client state handle
*
- * Returns an rpc_cred with reference count bumped, or NULL.
+ * Returns a cred with reference count bumped, or NULL.
*/
-struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
- spin_lock(&clp->cl_lock);
- cred = nfs4_get_machine_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
+ cred = nfs4_get_machine_cred(clp);
return cred;
}
static struct nfs4_state_owner *
-nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
+nfs4_find_state_owner_locked(struct nfs_server *server, const struct cred *cred)
{
struct rb_node **p = &server->state_owners.rb_node,
*parent = NULL;
struct nfs4_state_owner *sp;
+ int cmp;
while (*p != NULL) {
parent = *p;
sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+ cmp = cred_fscmp(cred, sp->so_cred);
- if (cred < sp->so_cred)
+ if (cmp < 0)
p = &parent->rb_left;
- else if (cred > sp->so_cred)
+ else if (cmp > 0)
p = &parent->rb_right;
else {
if (!list_empty(&sp->so_lru))
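
[note] Keying the state-owner rb-tree with cred_fscmp() instead of raw pointer comparison is the substantive change in this hunk (and in the insert path below): with generic creds, distinct logical users may share one struct cred object, and distinct objects may describe the same filesystem identity, so pointer order is no longer a meaningful key. cred_fscmp() compares the fs-relevant fields (fsuid, fsgid, group list) and returns <0/0/>0, giving the tree a stable total order. The lookup walk, sketched:

    /* cred_fscmp() and the rb-tree helpers are real APIs; demo_owner and
     * demo_find_owner are invented. */
    #include <linux/cred.h>
    #include <linux/rbtree.h>

    struct demo_owner {
            struct rb_node node;
            const struct cred *cred;
    };

    static struct demo_owner *demo_find_owner(struct rb_root *root,
                                              const struct cred *cred)
    {
            struct rb_node *n = root->rb_node;

            while (n) {
                    struct demo_owner *o = rb_entry(n, struct demo_owner, node);
                    int cmp = cred_fscmp(cred, o->cred);

                    if (cmp < 0)
                            n = n->rb_left;
                    else if (cmp > 0)
                            n = n->rb_right;
                    else
                            return o;
            }
            return NULL;
    }
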
@@ -436,14 +429,16 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
struct rb_node **p = &server->state_owners.rb_node,
*parent = NULL;
struct nfs4_state_owner *sp;
+ int cmp;
while (*p != NULL) {
parent = *p;
sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+ cmp = cred_fscmp(new->so_cred, sp->so_cred);
- if (new->so_cred < sp->so_cred)
+ if (cmp < 0)
p = &parent->rb_left;
- else if (new->so_cred > sp->so_cred)
+ else if (cmp > 0)
p = &parent->rb_right;
else {
if (!list_empty(&sp->so_lru))
@@ -490,7 +485,7 @@ nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
*/
static struct nfs4_state_owner *
nfs4_alloc_state_owner(struct nfs_server *server,
- struct rpc_cred *cred,
+ const struct cred *cred,
gfp_t gfp_flags)
{
struct nfs4_state_owner *sp;
@@ -505,7 +500,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
return NULL;
}
sp->so_server = server;
- sp->so_cred = get_rpccred(cred);
+ sp->so_cred = get_cred(cred);
spin_lock_init(&sp->so_lock);
INIT_LIST_HEAD(&sp->so_states);
nfs4_init_seqid_counter(&sp->so_seqid);
@@ -534,7 +529,7 @@ nfs4_reset_state_owner(struct nfs4_state_owner *sp)
static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
{
nfs4_destroy_seqid_counter(&sp->so_seqid);
- put_rpccred(sp->so_cred);
+ put_cred(sp->so_cred);
ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
kfree(sp);
}
@@ -572,7 +567,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
* Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
*/
struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
- struct rpc_cred *cred,
+ const struct cred *cred,
gfp_t gfp_flags)
{
struct nfs_client *clp = server->nfs_client;
@@ -655,7 +650,7 @@ nfs4_alloc_open_state(void)
state = kzalloc(sizeof(*state), GFP_NOFS);
if (!state)
return NULL;
- atomic_set(&state->count, 1);
+ refcount_set(&state->count, 1);
INIT_LIST_HEAD(&state->lock_states);
spin_lock_init(&state->state_lock);
seqlock_init(&state->seqlock);
@@ -684,12 +679,12 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs4_state *state;
- list_for_each_entry(state, &nfsi->open_states, inode_states) {
+ list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) {
if (state->owner != owner)
continue;
if (!nfs4_valid_open_stateid(state))
continue;
- if (atomic_inc_not_zero(&state->count))
+ if (refcount_inc_not_zero(&state->count))
return state;
}
return NULL;
@@ -698,7 +693,7 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
static void
nfs4_free_open_state(struct nfs4_state *state)
{
- kfree(state);
+ kfree_rcu(state, rcu_head);
}
struct nfs4_state *
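
[note] kfree_rcu() is the piece that makes the lockless __nfs4_find_state_byowner() walk above safe: readers traverse open_states under rcu_read_lock() and pin entries with refcount_inc_not_zero(), so after list_del_rcu() the object must remain readable until a grace period elapses. This assumes struct nfs4_state gains an rcu_head field in this series. The writer side of the pattern, sketched with an invented structure:

    /* Sketch of the RCU lifetime pattern: list_del_rcu() + kfree_rcu() on
     * the writer, rcu_read_lock() + refcount_inc_not_zero() on readers.
     * demo_state is illustrative; the primitives are real. */
    #include <linux/rculist.h>
    #include <linux/refcount.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct demo_state {
            struct list_head link;
            refcount_t count;
            struct rcu_head rcu_head;
    };

    static void demo_unlink_and_free(struct demo_state *s, spinlock_t *lock)
    {
            if (!refcount_dec_and_lock(&s->count, lock))
                    return;
            list_del_rcu(&s->link);  /* readers may still see the entry */
            spin_unlock(lock);
            kfree_rcu(s, rcu_head);  /* actual free after a grace period */
    }
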
@@ -707,9 +702,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
struct nfs4_state *state, *new;
struct nfs_inode *nfsi = NFS_I(inode);
- spin_lock(&inode->i_lock);
+ rcu_read_lock();
state = __nfs4_find_state_byowner(inode, owner);
- spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
if (state)
goto out;
new = nfs4_alloc_open_state();
@@ -720,7 +715,7 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
state = new;
state->owner = owner;
atomic_inc(&owner->so_count);
- list_add(&state->inode_states, &nfsi->open_states);
+ list_add_rcu(&state->inode_states, &nfsi->open_states);
ihold(inode);
state->inode = inode;
spin_unlock(&inode->i_lock);
@@ -743,10 +738,10 @@ void nfs4_put_open_state(struct nfs4_state *state)
struct inode *inode = state->inode;
struct nfs4_state_owner *owner = state->owner;
- if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
+ if (!refcount_dec_and_lock(&state->count, &owner->so_lock))
return;
spin_lock(&inode->i_lock);
- list_del(&state->inode_states);
+ list_del_rcu(&state->inode_states);
list_del(&state->open_states);
spin_unlock(&inode->i_lock);
spin_unlock(&owner->so_lock);
@@ -1041,7 +1036,7 @@ bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
*/
int nfs4_select_rw_stateid(struct nfs4_state *state,
fmode_t fmode, const struct nfs_lock_context *l_ctx,
- nfs4_stateid *dst, struct rpc_cred **cred)
+ nfs4_stateid *dst, const struct cred **cred)
{
int ret;
@@ -1210,6 +1205,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
return;
__module_get(THIS_MODULE);
@@ -1437,8 +1433,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
struct nfs4_state *state;
bool found = false;
- spin_lock(&inode->i_lock);
- list_for_each_entry(ctx, &nfsi->open_files, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
state = ctx->state;
if (state == NULL)
continue;
@@ -1456,7 +1452,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
nfs4_state_mark_reclaim_nograce(clp, state))
found = true;
}
- spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
nfs_inode_find_delegation_state_and_recover(inode, stateid);
if (found)
@@ -1469,13 +1465,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_context *ctx;
- spin_lock(&inode->i_lock);
- list_for_each_entry(ctx, &nfsi->open_files, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
if (ctx->state != state)
continue;
set_bit(NFS_CONTEXT_BAD, &ctx->flags);
}
- spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
}
static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
@@ -1549,10 +1545,62 @@ out:
return status;
}
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs4_copy_state *copy;
+
+ if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags))
+ return;
+
+ spin_lock(&sp->so_server->nfs_client->cl_lock);
+ list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
+ if (!nfs4_stateid_match_other(&state->stateid, &copy->parent_state->stateid))
+ continue;
+ copy->flags = 1;
+ complete(&copy->completion);
+ break;
+ }
+ spin_unlock(&sp->so_server->nfs_client->cl_lock);
+}
+#else /* !CONFIG_NFS_V4_2 */
+static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
+ struct nfs4_state *state)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
+ const struct nfs4_state_recovery_ops *ops)
+{
+ struct nfs4_lock_state *lock;
+ int status;
+
+ status = ops->recover_open(sp, state);
+ if (status < 0)
+ return status;
+
+ status = nfs4_reclaim_locks(state, ops);
+ if (status < 0)
+ return status;
+
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+ pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__);
+ }
+ spin_unlock(&state->state_lock);
+ }
+
+ nfs42_complete_copies(sp, state);
+ clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+ return status;
+}
+
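
[note] The deeply nested success path of nfs4_reclaim_open_state() is hoisted into __nfs4_reclaim_open_state() above, and the v4.2 copy-completion block becomes nfs42_complete_copies() with a static-inline no-op stub, keeping the caller free of #ifdefs; the rewrite also swaps the open-coded memcmp() on stateid.other for nfs4_stateid_match_other() and drops the stray "AGLO" debug printk. The stub idiom, sketched with invented names:

    /* Standard kernel idiom for optional features: declare the real hook
     * under the config option, compile a no-op inline otherwise. */
    struct demo_obj;

    #ifdef CONFIG_DEMO_FEATURE
    void demo_feature_hook(struct demo_obj *obj);
    #else
    static inline void demo_feature_hook(struct demo_obj *obj)
    {
    }
    #endif
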
static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
{
struct nfs4_state *state;
- struct nfs4_lock_state *lock;
int status = 0;
/* Note: we rely on the sp->so_states list being ordered
@@ -1573,79 +1621,45 @@ restart:
continue;
if (state->state == 0)
continue;
- atomic_inc(&state->count);
+ refcount_inc(&state->count);
spin_unlock(&sp->so_lock);
- status = ops->recover_open(sp, state);
- if (status >= 0) {
- status = nfs4_reclaim_locks(state, ops);
- if (status >= 0) {
- if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
- spin_lock(&state->state_lock);
- list_for_each_entry(lock, &state->lock_states, ls_locks) {
- if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
- pr_warn_ratelimited("NFS: "
- "%s: Lock reclaim "
- "failed!\n", __func__);
- }
- spin_unlock(&state->state_lock);
- }
- clear_bit(NFS_STATE_RECLAIM_NOGRACE,
- &state->flags);
-#ifdef CONFIG_NFS_V4_2
- if (test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) {
- struct nfs4_copy_state *copy;
-
- spin_lock(&sp->so_server->nfs_client->cl_lock);
- list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
- if (memcmp(&state->stateid.other, &copy->parent_state->stateid.other, NFS4_STATEID_SIZE))
- continue;
- copy->flags = 1;
- complete(&copy->completion);
- printk("AGLO: server rebooted waking up the copy\n");
- break;
- }
- spin_unlock(&sp->so_server->nfs_client->cl_lock);
- }
-#endif /* CONFIG_NFS_V4_2 */
- nfs4_put_open_state(state);
- spin_lock(&sp->so_lock);
- goto restart;
- }
- }
+ status = __nfs4_reclaim_open_state(sp, state, ops);
+
switch (status) {
- default:
- printk(KERN_ERR "NFS: %s: unhandled error %d\n",
- __func__, status);
- /* Fall through */
- case -ENOENT:
- case -ENOMEM:
- case -EACCES:
- case -EROFS:
- case -EIO:
- case -ESTALE:
- /* Open state on this file cannot be recovered */
- nfs4_state_mark_recovery_failed(state, status);
- break;
- case -EAGAIN:
- ssleep(1);
- /* Fall through */
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_OLD_STATEID:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_RECLAIM_BAD:
- case -NFS4ERR_RECLAIM_CONFLICT:
- nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ default:
+ if (status >= 0)
break;
- case -NFS4ERR_EXPIRED:
- case -NFS4ERR_NO_GRACE:
- nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
- case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- goto out_err;
+ printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status);
+ /* Fall through */
+ case -ENOENT:
+ case -ENOMEM:
+ case -EACCES:
+ case -EROFS:
+ case -EIO:
+ case -ESTALE:
+ /* Open state on this file cannot be recovered */
+ nfs4_state_mark_recovery_failed(state, status);
+ break;
+ case -EAGAIN:
+ ssleep(1);
+ /* Fall through */
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ goto out_err;
}
nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
@@ -1722,7 +1736,7 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
static int nfs4_reclaim_complete(struct nfs_client *clp,
const struct nfs4_state_recovery_ops *ops,
- struct rpc_cred *cred)
+ const struct cred *cred)
{
/* Notify the server we're done reclaiming our state */
if (ops->reclaim_complete)
@@ -1773,7 +1787,7 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
const struct nfs4_state_recovery_ops *ops;
- struct rpc_cred *cred;
+ const struct cred *cred;
int err;
if (!nfs4_state_clear_reclaim_reboot(clp))
@@ -1781,7 +1795,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
ops = clp->cl_mvops->reboot_recovery_ops;
cred = nfs4_get_clid_cred(clp);
err = nfs4_reclaim_complete(clp, ops, cred);
- put_rpccred(cred);
+ put_cred(cred);
if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
}
@@ -1795,38 +1809,38 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
{
switch (error) {
- case 0:
- break;
- case -NFS4ERR_CB_PATH_DOWN:
- nfs40_handle_cb_pathdown(clp);
- break;
- case -NFS4ERR_NO_GRACE:
- nfs4_state_end_reclaim_reboot(clp);
- break;
- case -NFS4ERR_STALE_CLIENTID:
- set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- nfs4_state_start_reclaim_reboot(clp);
- break;
- case -NFS4ERR_EXPIRED:
- set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- nfs4_state_start_reclaim_nograce(clp);
- break;
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- /* Zero session reset errors */
- break;
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
- break;
- default:
- dprintk("%s: failed to handle error %d for server %s\n",
- __func__, error, clp->cl_hostname);
- return error;
+ case 0:
+ break;
+ case -NFS4ERR_CB_PATH_DOWN:
+ nfs40_handle_cb_pathdown(clp);
+ break;
+ case -NFS4ERR_NO_GRACE:
+ nfs4_state_end_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_EXPIRED:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ /* Zero session reset errors */
+ break;
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ break;
+ default:
+ dprintk("%s: failed to handle error %d for server %s\n",
+ __func__, error, clp->cl_hostname);
+ return error;
}
dprintk("%s: handled error %d for server %s\n", __func__, error,
clp->cl_hostname);
@@ -1877,7 +1891,7 @@ restart:
static int nfs4_check_lease(struct nfs_client *clp)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
const struct nfs4_state_maintenance_ops *ops =
clp->cl_mvops->state_renewal_ops;
int status;
@@ -1885,9 +1899,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
/* Is the client already known to have an expired lease? */
if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
return 0;
- spin_lock(&clp->cl_lock);
- cred = ops->get_state_renewal_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
+ cred = ops->get_state_renewal_cred(clp);
if (cred == NULL) {
cred = nfs4_get_clid_cred(clp);
status = -ENOKEY;
@@ -1895,7 +1907,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
goto out;
}
status = ops->renew_lease(clp, cred);
- put_rpccred(cred);
+ put_cred(cred);
if (status == -ETIMEDOUT) {
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
return 0;
@@ -1955,7 +1967,7 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
static int nfs4_establish_lease(struct nfs_client *clp)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
const struct nfs4_state_recovery_ops *ops =
clp->cl_mvops->reboot_recovery_ops;
int status;
@@ -1967,7 +1979,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
if (cred == NULL)
return -ENOENT;
status = ops->establish_clid(clp, cred);
- put_rpccred(cred);
+ put_cred(cred);
if (status != 0)
return status;
pnfs_destroy_all_layouts(clp);
@@ -2014,7 +2026,7 @@ static int nfs4_purge_lease(struct nfs_client *clp)
*
* Returns zero or a negative NFS4ERR status code.
*/
-static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_fs_locations *locations = NULL;
@@ -2084,14 +2096,12 @@ static int nfs4_handle_migration(struct nfs_client *clp)
const struct nfs4_state_maintenance_ops *ops =
clp->cl_mvops->state_renewal_ops;
struct nfs_server *server;
- struct rpc_cred *cred;
+ const struct cred *cred;
dprintk("%s: migration reported on \"%s\"\n", __func__,
clp->cl_hostname);
- spin_lock(&clp->cl_lock);
- cred = ops->get_state_renewal_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
+ cred = ops->get_state_renewal_cred(clp);
if (cred == NULL)
return -NFS4ERR_NOENT;
@@ -2112,13 +2122,13 @@ restart:
rcu_read_unlock();
status = nfs4_try_migration(server, cred);
if (status < 0) {
- put_rpccred(cred);
+ put_cred(cred);
return status;
}
goto restart;
}
rcu_read_unlock();
- put_rpccred(cred);
+ put_cred(cred);
return 0;
}
@@ -2132,14 +2142,12 @@ static int nfs4_handle_lease_moved(struct nfs_client *clp)
const struct nfs4_state_maintenance_ops *ops =
clp->cl_mvops->state_renewal_ops;
struct nfs_server *server;
- struct rpc_cred *cred;
+ const struct cred *cred;
dprintk("%s: lease moved reported on \"%s\"\n", __func__,
clp->cl_hostname);
- spin_lock(&clp->cl_lock);
- cred = ops->get_state_renewal_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
+ cred = ops->get_state_renewal_cred(clp);
if (cred == NULL)
return -NFS4ERR_NOENT;
@@ -2167,7 +2175,7 @@ restart:
rcu_read_unlock();
out:
- put_rpccred(cred);
+ put_cred(cred);
return 0;
}
@@ -2190,7 +2198,7 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
const struct nfs4_state_recovery_ops *ops =
clp->cl_mvops->reboot_recovery_ops;
struct rpc_clnt *clnt;
- struct rpc_cred *cred;
+ const struct cred *cred;
int i, status;
dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
@@ -2206,7 +2214,7 @@ again:
goto out_unlock;
status = ops->detect_trunking(clp, result, cred);
- put_rpccred(cred);
+ put_cred(cred);
switch (status) {
case 0:
case -EINTR:
@@ -2397,7 +2405,7 @@ out_recovery:
static int nfs4_reset_session(struct nfs_client *clp)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
int status;
if (!nfs4_has_session(clp))
@@ -2435,14 +2443,13 @@ static int nfs4_reset_session(struct nfs_client *clp)
dprintk("%s: session reset was successful for server %s!\n",
__func__, clp->cl_hostname);
out:
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
return status;
}
static int nfs4_bind_conn_to_session(struct nfs_client *clp)
{
- struct rpc_cred *cred;
+ const struct cred *cred;
int ret;
if (!nfs4_has_session(clp))
@@ -2452,8 +2459,7 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
return ret;
cred = nfs4_get_clid_cred(clp);
ret = nfs4_proc_bind_conn_to_session(clp, cred);
- if (cred)
- put_rpccred(cred);
+ put_cred(cred);
clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
switch (ret) {
case 0:
@@ -2485,6 +2491,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
/* Ensure exclusive access to NFSv4 state */
do {
+ clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
section = "purge state";
status = nfs4_purge_lease(clp);
@@ -2575,19 +2582,24 @@ static void nfs4_state_manager(struct nfs_client *clp)
}
nfs4_end_drain_session(clp);
- if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
- nfs_client_return_marked_delegations(clp);
- continue;
+ nfs4_clear_state_manager_bit(clp);
+
+ if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
+ if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+ nfs_client_return_marked_delegations(clp);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+ clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state);
}
- nfs4_clear_state_manager_bit(clp);
/* Did we race with an attempt to give us more work? */
- if (clp->cl_state == 0)
- break;
+ if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+ return;
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
- break;
- } while (refcount_read(&clp->cl_count) > 1);
- return;
+ return;
+ } while (refcount_read(&clp->cl_count) > 1 && !signalled());
+ goto out_drain;
+
out_error:
if (strlen(section))
section_sep = ": ";
@@ -2595,6 +2607,7 @@ out_error:
" with error %d\n", section_sep, section,
clp->cl_hostname, -status);
ssleep(1);
+out_drain:
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
}
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b1483b303e0b..b4557cf685fb 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -10,157 +10,302 @@
#include <linux/tracepoint.h>
+TRACE_DEFINE_ENUM(EPERM);
+TRACE_DEFINE_ENUM(ENOENT);
+TRACE_DEFINE_ENUM(EIO);
+TRACE_DEFINE_ENUM(ENXIO);
+TRACE_DEFINE_ENUM(EACCES);
+TRACE_DEFINE_ENUM(EEXIST);
+TRACE_DEFINE_ENUM(EXDEV);
+TRACE_DEFINE_ENUM(ENOTDIR);
+TRACE_DEFINE_ENUM(EISDIR);
+TRACE_DEFINE_ENUM(EFBIG);
+TRACE_DEFINE_ENUM(ENOSPC);
+TRACE_DEFINE_ENUM(EROFS);
+TRACE_DEFINE_ENUM(EMLINK);
+TRACE_DEFINE_ENUM(ENAMETOOLONG);
+TRACE_DEFINE_ENUM(ENOTEMPTY);
+TRACE_DEFINE_ENUM(EDQUOT);
+TRACE_DEFINE_ENUM(ESTALE);
+TRACE_DEFINE_ENUM(EBADHANDLE);
+TRACE_DEFINE_ENUM(EBADCOOKIE);
+TRACE_DEFINE_ENUM(ENOTSUPP);
+TRACE_DEFINE_ENUM(ETOOSMALL);
+TRACE_DEFINE_ENUM(EREMOTEIO);
+TRACE_DEFINE_ENUM(EBADTYPE);
+TRACE_DEFINE_ENUM(EAGAIN);
+TRACE_DEFINE_ENUM(ELOOP);
+TRACE_DEFINE_ENUM(EOPNOTSUPP);
+TRACE_DEFINE_ENUM(EDEADLK);
+TRACE_DEFINE_ENUM(ENOMEM);
+TRACE_DEFINE_ENUM(EKEYEXPIRED);
+TRACE_DEFINE_ENUM(ETIMEDOUT);
+TRACE_DEFINE_ENUM(ERESTARTSYS);
+TRACE_DEFINE_ENUM(ECONNREFUSED);
+TRACE_DEFINE_ENUM(ECONNRESET);
+TRACE_DEFINE_ENUM(ENETUNREACH);
+TRACE_DEFINE_ENUM(EHOSTUNREACH);
+TRACE_DEFINE_ENUM(EHOSTDOWN);
+TRACE_DEFINE_ENUM(EPIPE);
+TRACE_DEFINE_ENUM(EPFNOSUPPORT);
+TRACE_DEFINE_ENUM(EPROTONOSUPPORT);
+
+TRACE_DEFINE_ENUM(NFS4_OK);
+TRACE_DEFINE_ENUM(NFS4ERR_ACCESS);
+TRACE_DEFINE_ENUM(NFS4ERR_ATTRNOTSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_ADMIN_REVOKED);
+TRACE_DEFINE_ENUM(NFS4ERR_BACK_CHAN_BUSY);
+TRACE_DEFINE_ENUM(NFS4ERR_BADCHAR);
+TRACE_DEFINE_ENUM(NFS4ERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADIOMODE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADLAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_BADLABEL);
+TRACE_DEFINE_ENUM(NFS4ERR_BADNAME);
+TRACE_DEFINE_ENUM(NFS4ERR_BADOWNER);
+TRACE_DEFINE_ENUM(NFS4ERR_BADSESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_BADSLOT);
+TRACE_DEFINE_ENUM(NFS4ERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADXDR);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_HIGH_SLOT);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_RANGE);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_SEQID);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_SESSION_DIGEST);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_CB_PATH_DOWN);
+TRACE_DEFINE_ENUM(NFS4ERR_CLID_INUSE);
+TRACE_DEFINE_ENUM(NFS4ERR_CLIENTID_BUSY);
+TRACE_DEFINE_ENUM(NFS4ERR_COMPLETE_ALREADY);
+TRACE_DEFINE_ENUM(NFS4ERR_CONN_NOT_BOUND_TO_SESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_DEADLOCK);
+TRACE_DEFINE_ENUM(NFS4ERR_DEADSESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_DELAY);
+TRACE_DEFINE_ENUM(NFS4ERR_DELEG_ALREADY_WANTED);
+TRACE_DEFINE_ENUM(NFS4ERR_DELEG_REVOKED);
+TRACE_DEFINE_ENUM(NFS4ERR_DENIED);
+TRACE_DEFINE_ENUM(NFS4ERR_DIRDELEG_UNAVAIL);
+TRACE_DEFINE_ENUM(NFS4ERR_DQUOT);
+TRACE_DEFINE_ENUM(NFS4ERR_ENCR_ALG_UNSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_EXIST);
+TRACE_DEFINE_ENUM(NFS4ERR_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4ERR_FBIG);
+TRACE_DEFINE_ENUM(NFS4ERR_FHEXPIRED);
+TRACE_DEFINE_ENUM(NFS4ERR_FILE_OPEN);
+TRACE_DEFINE_ENUM(NFS4ERR_GRACE);
+TRACE_DEFINE_ENUM(NFS4ERR_HASH_ALG_UNSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_INVAL);
+TRACE_DEFINE_ENUM(NFS4ERR_IO);
+TRACE_DEFINE_ENUM(NFS4ERR_ISDIR);
+TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTTRYLATER);
+TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTUNAVAILABLE);
+TRACE_DEFINE_ENUM(NFS4ERR_LEASE_MOVED);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCKED);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCKS_HELD);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCK_RANGE);
+TRACE_DEFINE_ENUM(NFS4ERR_MINOR_VERS_MISMATCH);
+TRACE_DEFINE_ENUM(NFS4ERR_MLINK);
+TRACE_DEFINE_ENUM(NFS4ERR_MOVED);
+TRACE_DEFINE_ENUM(NFS4ERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFS4ERR_NOENT);
+TRACE_DEFINE_ENUM(NFS4ERR_NOFILEHANDLE);
+TRACE_DEFINE_ENUM(NFS4ERR_NOMATCHING_LAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_NOSPC);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_NOT_ONLY_OP);
+TRACE_DEFINE_ENUM(NFS4ERR_NOT_SAME);
+TRACE_DEFINE_ENUM(NFS4ERR_NO_GRACE);
+TRACE_DEFINE_ENUM(NFS4ERR_NXIO);
+TRACE_DEFINE_ENUM(NFS4ERR_OLD_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_OPENMODE);
+TRACE_DEFINE_ENUM(NFS4ERR_OP_ILLEGAL);
+TRACE_DEFINE_ENUM(NFS4ERR_OP_NOT_IN_SESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_PERM);
+TRACE_DEFINE_ENUM(NFS4ERR_PNFS_IO_HOLE);
+TRACE_DEFINE_ENUM(NFS4ERR_PNFS_NO_LAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_RECALLCONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_BAD);
+TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_CONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_REJECT_DELEG);
+TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG);
+TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+TRACE_DEFINE_ENUM(NFS4ERR_REQ_TOO_BIG);
+TRACE_DEFINE_ENUM(NFS4ERR_RESOURCE);
+TRACE_DEFINE_ENUM(NFS4ERR_RESTOREFH);
+TRACE_DEFINE_ENUM(NFS4ERR_RETRY_UNCACHED_REP);
+TRACE_DEFINE_ENUM(NFS4ERR_RETURNCONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_ROFS);
+TRACE_DEFINE_ENUM(NFS4ERR_SAME);
+TRACE_DEFINE_ENUM(NFS4ERR_SHARE_DENIED);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQUENCE_POS);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQ_FALSE_RETRY);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQ_MISORDERED);
+TRACE_DEFINE_ENUM(NFS4ERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE_CLIENTID);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_SYMLINK);
+TRACE_DEFINE_ENUM(NFS4ERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFS4ERR_TOO_MANY_OPS);
+TRACE_DEFINE_ENUM(NFS4ERR_UNKNOWN_LAYOUTTYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_UNSAFE_COMPOUND);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONGSEC);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONG_CRED);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONG_TYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_XDEV);
+
#define show_nfsv4_errors(error) \
- __print_symbolic(error, \
+ __print_symbolic(-(error), \
{ NFS4_OK, "OK" }, \
/* Mapped by nfs4_stat_to_errno() */ \
- { -EPERM, "EPERM" }, \
- { -ENOENT, "ENOENT" }, \
- { -EIO, "EIO" }, \
- { -ENXIO, "ENXIO" }, \
- { -EACCES, "EACCES" }, \
- { -EEXIST, "EEXIST" }, \
- { -EXDEV, "EXDEV" }, \
- { -ENOTDIR, "ENOTDIR" }, \
- { -EISDIR, "EISDIR" }, \
- { -EFBIG, "EFBIG" }, \
- { -ENOSPC, "ENOSPC" }, \
- { -EROFS, "EROFS" }, \
- { -EMLINK, "EMLINK" }, \
- { -ENAMETOOLONG, "ENAMETOOLONG" }, \
- { -ENOTEMPTY, "ENOTEMPTY" }, \
- { -EDQUOT, "EDQUOT" }, \
- { -ESTALE, "ESTALE" }, \
- { -EBADHANDLE, "EBADHANDLE" }, \
- { -EBADCOOKIE, "EBADCOOKIE" }, \
- { -ENOTSUPP, "ENOTSUPP" }, \
- { -ETOOSMALL, "ETOOSMALL" }, \
- { -EREMOTEIO, "EREMOTEIO" }, \
- { -EBADTYPE, "EBADTYPE" }, \
- { -EAGAIN, "EAGAIN" }, \
- { -ELOOP, "ELOOP" }, \
- { -EOPNOTSUPP, "EOPNOTSUPP" }, \
- { -EDEADLK, "EDEADLK" }, \
+ { EPERM, "EPERM" }, \
+ { ENOENT, "ENOENT" }, \
+ { EIO, "EIO" }, \
+ { ENXIO, "ENXIO" }, \
+ { EACCES, "EACCES" }, \
+ { EEXIST, "EEXIST" }, \
+ { EXDEV, "EXDEV" }, \
+ { ENOTDIR, "ENOTDIR" }, \
+ { EISDIR, "EISDIR" }, \
+ { EFBIG, "EFBIG" }, \
+ { ENOSPC, "ENOSPC" }, \
+ { EROFS, "EROFS" }, \
+ { EMLINK, "EMLINK" }, \
+ { ENAMETOOLONG, "ENAMETOOLONG" }, \
+ { ENOTEMPTY, "ENOTEMPTY" }, \
+ { EDQUOT, "EDQUOT" }, \
+ { ESTALE, "ESTALE" }, \
+ { EBADHANDLE, "EBADHANDLE" }, \
+ { EBADCOOKIE, "EBADCOOKIE" }, \
+ { ENOTSUPP, "ENOTSUPP" }, \
+ { ETOOSMALL, "ETOOSMALL" }, \
+ { EREMOTEIO, "EREMOTEIO" }, \
+ { EBADTYPE, "EBADTYPE" }, \
+ { EAGAIN, "EAGAIN" }, \
+ { ELOOP, "ELOOP" }, \
+ { EOPNOTSUPP, "EOPNOTSUPP" }, \
+ { EDEADLK, "EDEADLK" }, \
/* RPC errors */ \
- { -ENOMEM, "ENOMEM" }, \
- { -EKEYEXPIRED, "EKEYEXPIRED" }, \
- { -ETIMEDOUT, "ETIMEDOUT" }, \
- { -ERESTARTSYS, "ERESTARTSYS" }, \
- { -ECONNREFUSED, "ECONNREFUSED" }, \
- { -ECONNRESET, "ECONNRESET" }, \
- { -ENETUNREACH, "ENETUNREACH" }, \
- { -EHOSTUNREACH, "EHOSTUNREACH" }, \
- { -EHOSTDOWN, "EHOSTDOWN" }, \
- { -EPIPE, "EPIPE" }, \
- { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \
- { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
+ { ENOMEM, "ENOMEM" }, \
+ { EKEYEXPIRED, "EKEYEXPIRED" }, \
+ { ETIMEDOUT, "ETIMEDOUT" }, \
+ { ERESTARTSYS, "ERESTARTSYS" }, \
+ { ECONNREFUSED, "ECONNREFUSED" }, \
+ { ECONNRESET, "ECONNRESET" }, \
+ { ENETUNREACH, "ENETUNREACH" }, \
+ { EHOSTUNREACH, "EHOSTUNREACH" }, \
+ { EHOSTDOWN, "EHOSTDOWN" }, \
+ { EPIPE, "EPIPE" }, \
+ { EPFNOSUPPORT, "EPFNOSUPPORT" }, \
+ { EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
/* NFSv4 native errors */ \
- { -NFS4ERR_ACCESS, "ACCESS" }, \
- { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
- { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
- { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
- { -NFS4ERR_BADCHAR, "BADCHAR" }, \
- { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \
- { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \
- { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
- { -NFS4ERR_BADLABEL, "BADLABEL" }, \
- { -NFS4ERR_BADNAME, "BADNAME" }, \
- { -NFS4ERR_BADOWNER, "BADOWNER" }, \
- { -NFS4ERR_BADSESSION, "BADSESSION" }, \
- { -NFS4ERR_BADSLOT, "BADSLOT" }, \
- { -NFS4ERR_BADTYPE, "BADTYPE" }, \
- { -NFS4ERR_BADXDR, "BADXDR" }, \
- { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
- { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
- { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
- { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
- { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
- { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
- { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
- { -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
- { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
- { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
- { -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
+ { NFS4ERR_ACCESS, "ACCESS" }, \
+ { NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
+ { NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
+ { NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
+ { NFS4ERR_BADCHAR, "BADCHAR" }, \
+ { NFS4ERR_BADHANDLE, "BADHANDLE" }, \
+ { NFS4ERR_BADIOMODE, "BADIOMODE" }, \
+ { NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
+ { NFS4ERR_BADLABEL, "BADLABEL" }, \
+ { NFS4ERR_BADNAME, "BADNAME" }, \
+ { NFS4ERR_BADOWNER, "BADOWNER" }, \
+ { NFS4ERR_BADSESSION, "BADSESSION" }, \
+ { NFS4ERR_BADSLOT, "BADSLOT" }, \
+ { NFS4ERR_BADTYPE, "BADTYPE" }, \
+ { NFS4ERR_BADXDR, "BADXDR" }, \
+ { NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
+ { NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
+ { NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
+ { NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
+ { NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
+ { NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
+ { NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+ { NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
+ { NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
+ { NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
+ { NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
"CONN_NOT_BOUND_TO_SESSION" }, \
- { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \
- { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
- { -NFS4ERR_DELAY, "DELAY" }, \
- { -NFS4ERR_DELEG_ALREADY_WANTED, \
+ { NFS4ERR_DEADLOCK, "DEADLOCK" }, \
+ { NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
+ { NFS4ERR_DELAY, "DELAY" }, \
+ { NFS4ERR_DELEG_ALREADY_WANTED, \
"DELEG_ALREADY_WANTED" }, \
- { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
- { -NFS4ERR_DENIED, "DENIED" }, \
- { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
- { -NFS4ERR_DQUOT, "DQUOT" }, \
- { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
- { -NFS4ERR_EXIST, "EXIST" }, \
- { -NFS4ERR_EXPIRED, "EXPIRED" }, \
- { -NFS4ERR_FBIG, "FBIG" }, \
- { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
- { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
- { -NFS4ERR_GRACE, "GRACE" }, \
- { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
- { -NFS4ERR_INVAL, "INVAL" }, \
- { -NFS4ERR_IO, "IO" }, \
- { -NFS4ERR_ISDIR, "ISDIR" }, \
- { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
- { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
- { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
- { -NFS4ERR_LOCKED, "LOCKED" }, \
- { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
- { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
- { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
- { -NFS4ERR_MLINK, "MLINK" }, \
- { -NFS4ERR_MOVED, "MOVED" }, \
- { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
- { -NFS4ERR_NOENT, "NOENT" }, \
- { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
- { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
- { -NFS4ERR_NOSPC, "NOSPC" }, \
- { -NFS4ERR_NOTDIR, "NOTDIR" }, \
- { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
- { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \
- { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
- { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \
- { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \
- { -NFS4ERR_NXIO, "NXIO" }, \
- { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
- { -NFS4ERR_OPENMODE, "OPENMODE" }, \
- { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
- { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
- { -NFS4ERR_PERM, "PERM" }, \
- { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
- { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
- { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
- { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
- { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
- { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
- { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
- { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \
+ { NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
+ { NFS4ERR_DENIED, "DENIED" }, \
+ { NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
+ { NFS4ERR_DQUOT, "DQUOT" }, \
+ { NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
+ { NFS4ERR_EXIST, "EXIST" }, \
+ { NFS4ERR_EXPIRED, "EXPIRED" }, \
+ { NFS4ERR_FBIG, "FBIG" }, \
+ { NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
+ { NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
+ { NFS4ERR_GRACE, "GRACE" }, \
+ { NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
+ { NFS4ERR_INVAL, "INVAL" }, \
+ { NFS4ERR_IO, "IO" }, \
+ { NFS4ERR_ISDIR, "ISDIR" }, \
+ { NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
+ { NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
+ { NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
+ { NFS4ERR_LOCKED, "LOCKED" }, \
+ { NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
+ { NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
+ { NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
+ { NFS4ERR_MLINK, "MLINK" }, \
+ { NFS4ERR_MOVED, "MOVED" }, \
+ { NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { NFS4ERR_NOENT, "NOENT" }, \
+ { NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
+ { NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
+ { NFS4ERR_NOSPC, "NOSPC" }, \
+ { NFS4ERR_NOTDIR, "NOTDIR" }, \
+ { NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
+ { NFS4ERR_NOTSUPP, "NOTSUPP" }, \
+ { NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
+ { NFS4ERR_NOT_SAME, "NOT_SAME" }, \
+ { NFS4ERR_NO_GRACE, "NO_GRACE" }, \
+ { NFS4ERR_NXIO, "NXIO" }, \
+ { NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
+ { NFS4ERR_OPENMODE, "OPENMODE" }, \
+ { NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
+ { NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
+ { NFS4ERR_PERM, "PERM" }, \
+ { NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
+ { NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
+ { NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
+ { NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
+ { NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
+ { NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
+ { NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
+ { NFS4ERR_REP_TOO_BIG_TO_CACHE, \
"REP_TOO_BIG_TO_CACHE" }, \
- { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
- { -NFS4ERR_RESOURCE, "RESOURCE" }, \
- { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \
- { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
- { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
- { -NFS4ERR_ROFS, "ROFS" }, \
- { -NFS4ERR_SAME, "SAME" }, \
- { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
- { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
- { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
- { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
- { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
- { -NFS4ERR_STALE, "STALE" }, \
- { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
- { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
- { -NFS4ERR_SYMLINK, "SYMLINK" }, \
- { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \
- { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
- { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
- { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
- { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \
- { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
- { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
- { -NFS4ERR_XDEV, "XDEV" })
+ { NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
+ { NFS4ERR_RESOURCE, "RESOURCE" }, \
+ { NFS4ERR_RESTOREFH, "RESTOREFH" }, \
+ { NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
+ { NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
+ { NFS4ERR_ROFS, "ROFS" }, \
+ { NFS4ERR_SAME, "SAME" }, \
+ { NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
+ { NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
+ { NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
+ { NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
+ { NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
+ { NFS4ERR_STALE, "STALE" }, \
+ { NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
+ { NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
+ { NFS4ERR_SYMLINK, "SYMLINK" }, \
+ { NFS4ERR_TOOSMALL, "TOOSMALL" }, \
+ { NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
+ { NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
+ { NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
+ { NFS4ERR_WRONGSEC, "WRONGSEC" }, \
+ { NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
+ { NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
+ { NFS4ERR_XDEV, "XDEV" })
#define show_open_flags(flags) \
__print_flags(flags, "|", \
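[Editor's note] The TRACE_DEFINE_ENUM() block above is needed because __print_symbolic() is resolved from the exported trace format string by userspace tools (trace-cmd, perf), where NFS4ERR_* names are opaque text; TRACE_DEFINE_ENUM() records each constant's numeric value so the names can be substituted. The macro now negates the status once up front, so the table can list the positive protocol values directly. A minimal user-space sketch of the same lookup — nfs4_err_name() is a hypothetical helper, not kernel code:

#include <stdio.h>

struct sym { int val; const char *name; };

/* Same { value, "NAME" } pairing as show_nfsv4_errors(), abridged. */
static const struct sym nfs4_errs[] = {
	{ 0,     "OK" },
	{ 1,     "PERM" },
	{ 2,     "NOENT" },
	{ 10008, "DELAY" },
};

static const char *nfs4_err_name(int error)
{
	int v = error < 0 ? -error : error;	/* mirrors the -(error) above */
	size_t i;

	for (i = 0; i < sizeof(nfs4_errs) / sizeof(nfs4_errs[0]); i++)
		if (nfs4_errs[i].val == v)
			return nfs4_errs[i].name;
	return "UNKNOWN";
}

int main(void)
{
	printf("%s\n", nfs4_err_name(-10008));	/* prints "DELAY" */
	return 0;
}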
@@ -558,6 +703,13 @@ TRACE_EVENT(nfs4_close,
)
);
+TRACE_DEFINE_ENUM(F_GETLK);
+TRACE_DEFINE_ENUM(F_SETLK);
+TRACE_DEFINE_ENUM(F_SETLKW);
+TRACE_DEFINE_ENUM(F_RDLCK);
+TRACE_DEFINE_ENUM(F_WRLCK);
+TRACE_DEFINE_ENUM(F_UNLCK);
+
#define show_lock_cmd(type) \
__print_symbolic((int)type, \
{ F_GETLK, "GETLK" }, \
@@ -1451,6 +1603,10 @@ DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
+TRACE_DEFINE_ENUM(IOMODE_READ);
+TRACE_DEFINE_ENUM(IOMODE_RW);
+TRACE_DEFINE_ENUM(IOMODE_ANY);
+
#define show_pnfs_iomode(iomode) \
__print_symbolic(iomode, \
{ IOMODE_READ, "READ" }, \
@@ -1528,6 +1684,20 @@ DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_MDSTHRESH);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NOMEM);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BULK_RECALL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETURN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+
#define show_pnfs_update_layout_reason(reason) \
__print_symbolic(reason, \
{ PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b7bde12d8cd5..2fc8f6fa25e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3516,7 +3516,7 @@ static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
{
__be32 *p;
- int len;
+ u32 len;
if (fh != NULL)
memset(fh, 0, sizeof(*fh));
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index bb5476a6d264..e54d899c1848 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -63,14 +63,14 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
{
- spin_lock(&hdr->lock);
- if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
- || pos < hdr->io_start + hdr->good_bytes) {
+ unsigned int new = pos - hdr->io_start;
+
+ if (hdr->good_bytes > new) {
+ hdr->good_bytes = new;
clear_bit(NFS_IOHDR_EOF, &hdr->flags);
- hdr->good_bytes = pos - hdr->io_start;
- hdr->error = error;
+ if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags))
+ hdr->error = error;
}
- spin_unlock(&hdr->lock);
}
static inline struct nfs_page *
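[Editor's note] The rewrite above removes hdr->lock by making the update monotonic: good_bytes only ever shrinks, and test_and_set_bit() latches the first error so later callers cannot overwrite it. A hedged user-space model of that ordering, with illustrative names only:

#include <stdatomic.h>

struct io_hdr {
	unsigned int good_bytes;	/* bytes known good from io_start */
	atomic_flag error_set;		/* stands in for NFS_IOHDR_ERROR */
	int error;
};

static void set_io_error(struct io_hdr *hdr, int error, unsigned int new_good)
{
	if (hdr->good_bytes > new_good) {
		hdr->good_bytes = new_good;	/* truncate, never extend */
		/* First caller to set the flag owns hdr->error. */
		if (!atomic_flag_test_and_set(&hdr->error_set))
			hdr->error = error;
	}
}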
@@ -461,7 +461,7 @@ EXPORT_SYMBOL_GPL(nfs_wait_on_request);
* @prev: previous request in desc, or NULL
* @req: this request
*
- * Returns zero if @req can be coalesced into @desc, otherwise it returns
+ * Returns zero if @req cannot be coalesced into @desc, otherwise it returns
* the size of the request.
*/
size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
@@ -494,7 +494,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
if (hdr) {
INIT_LIST_HEAD(&hdr->pages);
- spin_lock_init(&hdr->lock);
hdr->rw_ops = ops;
}
return hdr;
@@ -588,7 +587,7 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
}
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
- struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
const struct rpc_call_ops *call_ops, int how, int flags)
{
struct rpc_task *task;
@@ -1111,6 +1110,20 @@ static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
return ret;
}
+static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
+{
+ u32 midx;
+ struct nfs_pgio_mirror *mirror;
+
+ if (!desc->pg_error)
+ return;
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ }
+}
+
int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
@@ -1161,25 +1174,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
return 1;
out_failed:
- /*
- * We might have failed before sending any reqs over wire.
- * Clean up rest of the reqs in mirror pg_list.
- */
- if (desc->pg_error) {
- struct nfs_pgio_mirror *mirror;
- void (*func)(struct list_head *);
-
- /* remember fatal errors */
- if (nfs_error_is_fatal(desc->pg_error))
- nfs_context_set_write_error(req->wb_context,
- desc->pg_error);
-
- func = desc->pg_completion_ops->error_cleanup;
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- mirror = &desc->pg_mirrors[midx];
- func(&mirror->pg_list);
- }
- }
+ nfs_pageio_error_cleanup(desc);
return 0;
}
@@ -1251,6 +1246,8 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
for (midx = 0; midx < desc->pg_mirror_count; midx++)
nfs_pageio_complete_mirror(desc, midx);
+ if (desc->pg_error < 0)
+ nfs_pageio_error_cleanup(desc);
if (desc->pg_ops->pg_cleanup)
desc->pg_ops->pg_cleanup(desc);
nfs_pageio_cleanup_mirroring(desc);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7d9a51e6b847..53726da5c010 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -275,7 +275,7 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
}
- put_rpccred(lo->plh_lc_cred);
+ put_cred(lo->plh_lc_cred);
return ld->free_layout_hdr(lo);
}
@@ -965,7 +965,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
struct page **pages;
int i;
- pages = kcalloc(size, sizeof(struct page *), gfp_flags);
+ pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
if (!pages) {
dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
return NULL;
@@ -975,7 +975,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
pages[i] = alloc_page(gfp_flags);
if (!pages[i]) {
dprintk("%s: failed to allocate page\n", __func__);
- nfs4_free_pages(pages, size);
+ nfs4_free_pages(pages, i);
return NULL;
}
}
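[Editor's note] Two fixes land in nfs4_alloc_pages() above: kcalloc() becomes kmalloc_array() because every slot is written before it can be read, and the failure path now frees only the i pages actually allocated instead of walking uninitialized slots. A standalone sketch of the allocate-or-unwind pattern, with plain malloc() standing in for alloc_page():

#include <stdlib.h>

static void free_bufs(void **bufs, size_t count)
{
	size_t i;

	for (i = 0; i < count; i++)
		free(bufs[i]);
	free(bufs);
}

static void **alloc_bufs(size_t n, size_t sz)
{
	/* No zeroing needed: every slot is assigned before use. */
	void **bufs = malloc(n * sizeof(*bufs));
	size_t i;

	if (!bufs)
		return NULL;
	for (i = 0; i < n; i++) {
		bufs[i] = malloc(sz);
		if (!bufs[i]) {
			/* Free only the i buffers that exist — passing n
			 * here would free() uninitialized pointers, the
			 * bug the hunk above fixes. */
			free_bufs(bufs, i);
			return NULL;
		}
	}
	return bufs;
}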
@@ -991,6 +991,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
gfp_t gfp_flags)
{
struct nfs_server *server = pnfs_find_server(ino, ctx);
+ size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
size_t max_pages = max_response_pages(server);
struct nfs4_layoutget *lgp;
@@ -1000,6 +1001,12 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
if (lgp == NULL)
return NULL;
+ if (max_reply_sz) {
+ size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (npages < max_pages)
+ max_pages = npages;
+ }
+
lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
if (!lgp->args.layout.pages) {
kfree(lgp);
@@ -1031,7 +1038,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
lgp->args.ctx = get_nfs_open_context(ctx);
nfs4_stateid_copy(&lgp->args.stateid, stateid);
lgp->gfp_flags = gfp_flags;
- lgp->cred = get_rpccred(ctx->cred);
+ lgp->cred = get_cred(ctx->cred);
return lgp;
}
@@ -1042,7 +1049,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
nfs4_free_pages(lgp->args.layout.pages, max_pages);
if (lgp->args.inode)
pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
- put_rpccred(lgp->cred);
+ put_cred(lgp->cred);
put_nfs_open_context(lgp->args.ctx);
kfree(lgp);
}
@@ -1317,7 +1324,7 @@ pnfs_commit_and_return_layout(struct inode *inode)
bool pnfs_roc(struct inode *ino,
struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
- const struct rpc_cred *cred)
+ const struct cred *cred)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_open_context *ctx;
@@ -1332,6 +1339,7 @@ bool pnfs_roc(struct inode *ino,
if (!nfs_have_layout(ino))
return false;
retry:
+ rcu_read_lock();
spin_lock(&ino->i_lock);
lo = nfsi->layout;
if (!lo || !pnfs_layout_is_valid(lo) ||
@@ -1342,6 +1350,7 @@ retry:
pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
+ rcu_read_unlock();
wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
TASK_UNINTERRUPTIBLE);
pnfs_put_layout_hdr(lo);
@@ -1355,7 +1364,7 @@ retry:
skip_read = true;
}
- list_for_each_entry(ctx, &nfsi->open_files, list) {
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
state = ctx->state;
if (state == NULL)
continue;
@@ -1403,6 +1412,7 @@ retry:
out_noroc:
spin_unlock(&ino->i_lock);
+ rcu_read_unlock();
pnfs_layoutcommit_inode(ino, true);
if (roc) {
struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
@@ -1573,7 +1583,7 @@ alloc_init_layout_hdr(struct inode *ino,
INIT_LIST_HEAD(&lo->plh_return_segs);
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
- lo->plh_lc_cred = get_rpccred(ctx->cred);
+ lo->plh_lc_cred = get_cred(ctx->cred);
lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
return lo;
}
@@ -2918,7 +2928,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
spin_unlock(&inode->i_lock);
data->args.inode = inode;
- data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
+ data->cred = get_cred(nfsi->layout->plh_lc_cred);
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
@@ -2931,7 +2941,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
if (ld->prepare_layoutcommit) {
status = ld->prepare_layoutcommit(&data->args);
if (status) {
- put_rpccred(data->cred);
+ put_cred(data->cred);
spin_lock(&inode->i_lock);
set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
if (end_pos > nfsi->layout->plh_lwb)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ece367ebde69..5e80a07b7bea 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -125,6 +125,7 @@ struct pnfs_layoutdriver_type {
struct module *owner;
unsigned flags;
unsigned max_deviceinfo_size;
+ unsigned max_layoutget_response;
int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
int (*clear_layoutdriver) (struct nfs_server *);
@@ -199,7 +200,7 @@ struct pnfs_layout_hdr {
u32 plh_return_seq;
enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
- struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
+ const struct cred *plh_lc_cred; /* layoutcommit cred */
struct inode *plh_inode;
};
@@ -229,7 +230,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern size_t max_response_pages(struct nfs_server *server);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
- struct rpc_cred *cred);
+ const struct cred *cred);
extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
@@ -279,7 +280,7 @@ int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
bool pnfs_roc(struct inode *ino,
struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
- const struct rpc_cred *cred);
+ const struct cred *cred);
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
int ret);
@@ -342,7 +343,7 @@ struct nfs4_deviceid_node {
struct nfs4_deviceid_node *
nfs4_find_get_deviceid(struct nfs_server *server,
- const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ const struct nfs4_deviceid *id, const struct cred *cred,
gfp_t gfp_mask);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
@@ -693,7 +694,7 @@ static inline bool
pnfs_roc(struct inode *ino,
struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
- const struct rpc_cred *cred)
+ const struct cred *cred)
{
return false;
}
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index e8a07b3f9aaa..7fb59487ee90 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -94,7 +94,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
static struct nfs4_deviceid_node *
nfs4_get_device_info(struct nfs_server *server,
const struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred, gfp_t gfp_flags)
+ const struct cred *cred, gfp_t gfp_flags)
{
struct nfs4_deviceid_node *d = NULL;
struct pnfs_device *pdev = NULL;
@@ -184,7 +184,7 @@ __nfs4_find_get_deviceid(struct nfs_server *server,
struct nfs4_deviceid_node *
nfs4_find_get_deviceid(struct nfs_server *server,
- const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ const struct nfs4_deviceid *id, const struct cred *cred,
gfp_t gfp_mask)
{
long hash = nfs4_deviceid_hash(id);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index d5e4d3cd8c7f..f5ad75fafc3c 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -686,7 +686,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_setup_test_and_add_xprt,
&rpcdata);
if (xprtdata.cred)
- put_rpccred(xprtdata.cred);
+ put_cred(xprtdata.cred);
} else {
clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e0c257bd62b9..5552fa8b6e12 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -490,7 +490,7 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name)
* from nfs_readdir by calling the decode_entry function directly.
*/
static int
-nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+nfs_proc_readdir(struct dentry *dentry, const struct cred *cred,
u64 cookie, struct page **pages, unsigned int count, bool plus)
{
struct inode *dir = d_inode(dentry);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 48d7277c60a9..f9f19784db82 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -276,16 +276,14 @@ static void nfs_readpage_result(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
if (hdr->res.eof) {
- loff_t bound;
+ loff_t pos = hdr->args.offset + hdr->res.count;
+ unsigned int new = pos - hdr->io_start;
- bound = hdr->args.offset + hdr->res.count;
- spin_lock(&hdr->lock);
- if (bound < hdr->io_start + hdr->good_bytes) {
+ if (hdr->good_bytes > new) {
+ hdr->good_bytes = new;
set_bit(NFS_IOHDR_EOF, &hdr->flags);
clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
- hdr->good_bytes = bound - hdr->io_start;
}
- spin_unlock(&hdr->lock);
} else if (hdr->res.count < hdr->args.count)
nfs_readpage_retry(task, hdr);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ac4b2f005778..0570391eaa16 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -929,7 +929,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
data->minorversion = 0;
data->need_mount = true;
data->net = current->nsproxy->net_ns;
- security_init_mnt_opts(&data->lsm_opts);
+ data->lsm_opts = NULL;
}
return data;
}
@@ -1206,7 +1206,7 @@ static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option,
static int nfs_parse_mount_options(char *raw,
struct nfs_parsed_mount_data *mnt)
{
- char *p, *string, *secdata;
+ char *p, *string;
int rc, sloppy = 0, invalid_option = 0;
unsigned short protofamily = AF_UNSPEC;
unsigned short mountfamily = AF_UNSPEC;
@@ -1217,20 +1217,10 @@ static int nfs_parse_mount_options(char *raw,
}
dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw);
- secdata = alloc_secdata();
- if (!secdata)
- goto out_nomem;
-
- rc = security_sb_copy_data(raw, secdata);
+ rc = security_sb_eat_lsm_opts(raw, &mnt->lsm_opts);
if (rc)
goto out_security_failure;
- rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts);
- if (rc)
- goto out_security_failure;
-
- free_secdata(secdata);
-
while ((p = strsep(&raw, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
unsigned long option;
@@ -1682,7 +1672,6 @@ out_nomem:
printk(KERN_INFO "NFS: not enough memory to parse option\n");
return 0;
out_security_failure:
- free_secdata(secdata);
printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
return 0;
}
@@ -1906,6 +1895,11 @@ static int nfs_parse_devname(const char *dev_name,
size_t len;
char *end;
+ if (unlikely(!dev_name || !*dev_name)) {
+ dfprintk(MOUNT, "NFS: device name not specified\n");
+ return -EINVAL;
+ }
+
/* Is the host name protected with square brackets? */
if (*dev_name == '[') {
end = strchr(++dev_name, ']');
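[Editor's note] The added guard rejects a NULL or empty device name before any parsing begins. For the bracket case that follows it, the brackets exist because IPv6 addresses contain ':', which is also the host/path separator. A hedged standalone sketch of the "[host]:/export" split — not the nfs_parse_devname() internals:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *dev = "[fe80::1]:/export/home";
	const char *host = dev + 1;		/* skip '[' */
	const char *end = strchr(host, ']');	/* find the closing bracket */

	if (end && end[1] == ':')
		printf("host=%.*s path=%s\n",
		       (int)(end - host), host, end + 2);
	return 0;
}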
@@ -2081,14 +2075,9 @@ static int nfs23_validate_mount_data(void *options,
if (data->context[0]){
#ifdef CONFIG_SECURITY_SELINUX
int rc;
- char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL);
- if (!opts_str)
- return -ENOMEM;
- strcpy(opts_str, "context=");
data->context[NFS_MAX_CONTEXT_LEN] = '\0';
- strcat(opts_str, &data->context[0]);
- rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts);
- kfree(opts_str);
+ rc = security_add_mnt_opt("context", data->context,
+ strlen(data->context), &args->lsm_opts);
if (rc)
return rc;
#else
@@ -2168,7 +2157,10 @@ static int nfs_validate_text_mount_data(void *options,
if (args->version == 4) {
#if IS_ENABLED(CONFIG_NFS_V4)
- port = NFS_PORT;
+ if (args->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+ port = NFS_RDMA_PORT;
+ else
+ port = NFS_PORT;
max_namelen = NFS4_MAXNAMLEN;
max_pathlen = NFS4_MAXPATHLEN;
nfs_validate_transport_protocol(args);
@@ -2178,8 +2170,11 @@ static int nfs_validate_text_mount_data(void *options,
#else
goto out_v4_not_compiled;
#endif /* CONFIG_NFS_V4 */
- } else
+ } else {
nfs_set_mount_transport_protocol(args);
+ if (args->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+ port = NFS_RDMA_PORT;
+ }
nfs_set_port(sap, &args->nfs_server.port, port);
@@ -2265,7 +2260,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
options->version <= 6))))
return 0;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = nfs_alloc_parsed_mount_data();
if (data == NULL)
return -ENOMEM;
@@ -2304,8 +2299,10 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
/* compare new mount options with old ones */
error = nfs_compare_remount_data(nfss, data);
+ if (!error)
+ error = security_sb_remount(sb, data->lsm_opts);
out:
- kfree(data);
+ nfs_free_parsed_mount_data(data);
return error;
}
EXPORT_SYMBOL_GPL(nfs_remount);
@@ -2409,8 +2406,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
goto Ebusy;
if (a->acdirmax != b->acdirmax)
goto Ebusy;
- if (b->auth_info.flavor_len > 0 &&
- clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
+ if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
goto Ebusy;
return 1;
Ebusy:
@@ -2543,7 +2539,7 @@ int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
kflags |= SECURITY_LSM_NATIVE_LABELS;
- error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+ error = security_sb_set_mnt_opts(s, mount_info->parsed->lsm_opts,
kflags, &kflags_out);
if (error)
goto err;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index fd61bf0fce63..79b97b3c4427 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -31,7 +31,7 @@
static void
nfs_free_unlinkdata(struct nfs_unlinkdata *data)
{
- put_rpccred(data->cred);
+ put_cred(data->cred);
kfree(data->args.name.name);
kfree(data);
}
@@ -177,11 +177,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
goto out_free;
data->args.name.len = name->len;
- data->cred = rpc_lookup_cred();
- if (IS_ERR(data->cred)) {
- status = PTR_ERR(data->cred);
- goto out_free_name;
- }
+ data->cred = get_current_cred();
data->res.dir_attr = &data->dir_attr;
init_waitqueue_head(&data->wq);
@@ -202,8 +198,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
return 0;
out_unlock:
spin_unlock(&dentry->d_lock);
- put_rpccred(data->cred);
-out_free_name:
+ put_cred(data->cred);
kfree(data->args.name.name);
out_free:
kfree(data);
@@ -307,7 +302,7 @@ static void nfs_async_rename_release(void *calldata)
iput(data->old_dir);
iput(data->new_dir);
nfs_sb_deactive(sb);
- put_rpccred(data->cred);
+ put_cred(data->cred);
kfree(data);
}
@@ -352,12 +347,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
return ERR_PTR(-ENOMEM);
task_setup_data.callback_data = data;
- data->cred = rpc_lookup_cred();
- if (IS_ERR(data->cred)) {
- struct rpc_task *task = ERR_CAST(data->cred);
- kfree(data);
- return task;
- }
+ data->cred = get_current_cred();
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 586726a590d8..d09c9f878141 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -238,9 +238,9 @@ out:
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
-static void nfs_set_pageerror(struct page *page)
+static void nfs_set_pageerror(struct address_space *mapping)
{
- nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
+ nfs_zap_mapping(mapping->host, mapping);
}
/*
@@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
nfs_set_page_writeback(page);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
- ret = 0;
+ ret = req->wb_context->error;
/* If there is a fatal error that covers this write, just exit */
- if (nfs_error_is_fatal_on_server(req->wb_context->error))
+ if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
+ ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
ret = pgio->pg_error;
/*
@@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
nfs_context_set_write_error(req->wb_context, ret);
if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
- }
+ } else
+ ret = -EAGAIN;
nfs_redirty_request(req);
- ret = -EAGAIN;
} else
nfs_add_stats(page_file_mapping(page)->host,
NFSIOS_WRITEPAGES, 1);
@@ -993,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
- nfs_set_pageerror(req->wb_page);
+ nfs_set_pageerror(page_file_mapping(req->wb_page));
nfs_context_set_write_error(req->wb_context, hdr->error);
goto remove_req;
}
@@ -1233,9 +1234,12 @@ int
nfs_key_timeout_notify(struct file *filp, struct inode *inode)
{
struct nfs_open_context *ctx = nfs_file_open_context(filp);
- struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
- return rpcauth_key_timeout_notify(auth, ctx->cred);
+ if (nfs_ctx_key_to_expire(ctx, inode) &&
+ !ctx->ll_cred)
+ /* Already expired! */
+ return -EACCES;
+ return 0;
}
/*
@@ -1244,8 +1248,23 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{
struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+ struct rpc_cred *cred = ctx->ll_cred;
+ struct auth_cred acred = {
+ .cred = ctx->cred,
+ };
- return rpcauth_cred_key_to_expire(auth, ctx->cred);
+ if (cred && !cred->cr_ops->crmatch(&acred, cred, 0)) {
+ put_rpccred(cred);
+ ctx->ll_cred = NULL;
+ cred = NULL;
+ }
+ if (!cred)
+ cred = auth->au_ops->lookup_cred(auth, &acred, 0);
+ if (!cred || IS_ERR(cred))
+ return true;
+ ctx->ll_cred = cred;
+ return !!(cred->cr_ops->crkey_timeout &&
+ cred->cr_ops->crkey_timeout(cred));
}
/*
@@ -1329,7 +1348,8 @@ int nfs_updatepage(struct file *file, struct page *page,
unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = page_file_mapping(page)->host;
+ struct address_space *mapping = page_file_mapping(page);
+ struct inode *inode = mapping->host;
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -1347,7 +1367,7 @@ int nfs_updatepage(struct file *file, struct page *page,
status = nfs_writepage_setup(ctx, page, offset, count);
if (status < 0)
- nfs_set_pageerror(page);
+ nfs_set_pageerror(mapping);
else
__set_page_dirty_nobuffers(page);
out:
@@ -2121,7 +2141,7 @@ int __init nfs_init_writepagecache(void)
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
- nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
if (nfs_congestion_kb > 256*1024)
nfs_congestion_kb = 256*1024;
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b7559c6f2b97..4a98537efb0f 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -19,18 +19,22 @@
* is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
- struct list_head c_lru;
+ struct {
+ /* Keep often-read xid, csum in the same cache line: */
+ __be32 k_xid;
+ __wsum k_csum;
+ u32 k_proc;
+ u32 k_prot;
+ u32 k_vers;
+ unsigned int k_len;
+ struct sockaddr_in6 k_addr;
+ } c_key;
+ struct rb_node c_node;
+ struct list_head c_lru;
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
c_secure : 1; /* req came from port < 1024 */
- struct sockaddr_in6 c_addr;
- __be32 c_xid;
- u32 c_prot;
- u32 c_proc;
- u32 c_vers;
- unsigned int c_len;
- __wsum c_csum;
unsigned long c_timestamp;
union {
struct kvec u_vec;
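[Editor's note] The reply-cache entry above is reshaped so everything a lookup compares lives in one embedded c_key, with the hot xid/checksum pair first in the cache line, and an rb_node is added so entries can be found by ordered key rather than by scanning the LRU list. A hedged sketch of ordering such keys for an rbtree — field-by-field, since memcmp() over a struct would also compare padding unless it is zeroed:

struct cache_key {
	unsigned int xid;	/* hottest discriminators first */
	unsigned int csum;
	unsigned int proc, prot, vers, len;
};

/* Returns <0, 0, >0 like memcmp(); cheapest fields compared first. */
static int cache_key_cmp(const struct cache_key *a,
			 const struct cache_key *b)
{
	if (a->xid != b->xid)
		return a->xid < b->xid ? -1 : 1;
	if (a->csum != b->csum)
		return a->csum < b->csum ? -1 : 1;
	if (a->proc != b->proc)
		return a->proc < b->proc ? -1 : 1;
	if (a->prot != b->prot)
		return a->prot < b->prot ? -1 : 1;
	if (a->vers != b->vers)
		return a->vers < b->vers ? -1 : 1;
	return a->len == b->len ? 0 : (a->len < b->len ? -1 : 1);
}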
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a1143f7c2201..802993d8912f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -46,7 +46,7 @@ static void expkey_put(struct kref *ref)
!test_bit(CACHE_NEGATIVE, &key->h.flags))
path_put(&key->ek_path);
auth_domain_put(key->ek_client);
- kfree(key);
+ kfree_rcu(key, ek_rcu);
}
static void expkey_request(struct cache_detail *cd,
@@ -265,7 +265,7 @@ svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item)
struct cache_head *ch;
int hash = svc_expkey_hash(item);
- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct svc_expkey, h);
else
@@ -314,7 +314,7 @@ static void svc_export_put(struct kref *ref)
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp->ex_uuid);
- kfree(exp);
+ kfree_rcu(exp, ex_rcu);
}
static void svc_export_request(struct cache_detail *cd,
@@ -780,7 +780,7 @@ svc_export_lookup(struct svc_export *exp)
struct cache_head *ch;
int hash = svc_export_hash(exp);
- ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash);
+ ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash);
if (ch)
return container_of(ch, struct svc_export, h);
else
@@ -1216,9 +1216,9 @@ static int e_show(struct seq_file *m, void *p)
}
const struct seq_operations nfs_exports_op = {
- .start = cache_seq_start,
- .next = cache_seq_next,
- .stop = cache_seq_stop,
+ .start = cache_seq_start_rcu,
+ .next = cache_seq_next_rcu,
+ .stop = cache_seq_stop_rcu,
.show = e_show,
};
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index c8b74126ddaa..e7daa1f246f0 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -61,6 +61,7 @@ struct svc_export {
u32 ex_layout_types;
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
+ struct rcu_head ex_rcu;
};
/* an "export key" (expkey) maps a filehandlefragement to an
@@ -75,6 +76,7 @@ struct svc_expkey {
u32 ek_fsid[6];
struct path ek_path;
+ struct rcu_head ek_rcu;
};
#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 426f55005697..32cb8c027483 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -123,6 +123,14 @@ struct nfsd_net {
wait_queue_head_t ntf_wq;
atomic_t ntf_refcnt;
+
+ /*
+ * clientid and stateid data for construction of net unique COPY
+ * stateids.
+ */
+ u32 s2s_cp_cl_id;
+ struct idr s2s_cp_stateids;
+ spinlock_t s2s_cp_lock;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 601bf33c26a0..c74e4538d0eb 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,7 @@
#include "state.h"
#include "netns.h"
#include "xdr4cb.h"
+#include "xdr4.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -105,6 +106,7 @@ enum nfs_cb_opnum4 {
OP_CB_WANTS_CANCELLED = 12,
OP_CB_NOTIFY_LOCK = 13,
OP_CB_NOTIFY_DEVICEID = 14,
+ OP_CB_OFFLOAD = 15,
OP_CB_ILLEGAL = 10044
};
@@ -683,6 +685,101 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
}
/*
+ * struct write_response4 {
+ * stateid4 wr_callback_id<1>;
+ * length4 wr_count;
+ * stable_how4 wr_committed;
+ * verifier4 wr_writeverf;
+ * };
+ * union offload_info4 switch (nfsstat4 coa_status) {
+ * case NFS4_OK:
+ * write_response4 coa_resok4;
+ * default:
+ * length4 coa_bytes_copied;
+ * };
+ * struct CB_OFFLOAD4args {
+ * nfs_fh4 coa_fh;
+ * stateid4 coa_stateid;
+ * offload_info4 coa_offload_info;
+ * };
+ */
+static void encode_offload_info4(struct xdr_stream *xdr,
+ __be32 nfserr,
+ const struct nfsd4_copy *cp)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p++ = nfserr;
+ if (!nfserr) {
+ p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ p = xdr_encode_empty_array(p);
+ p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
+ *p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
+ p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
+ } else {
+ p = xdr_reserve_space(xdr, 8);
+ /* We always return success if bytes were written */
+ p = xdr_encode_hyper(p, 0);
+ }
+}
+
+static void encode_cb_offload4args(struct xdr_stream *xdr,
+ __be32 nfserr,
+ const struct knfsd_fh *fh,
+ const struct nfsd4_copy *cp,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p++ = cpu_to_be32(OP_CB_OFFLOAD);
+ encode_nfs_fh4(xdr, fh);
+ encode_stateid4(xdr, &cp->cp_res.cb_stateid);
+ encode_offload_info4(xdr, nfserr, cp);
+
+ hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfsd4_callback *cb = data;
+ const struct nfsd4_copy *cp =
+ container_of(cb, struct nfsd4_copy, cp_cb);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
+ encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfsd4_callback *cb = data;
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+ }
+ return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
+}
+/*
* RPC procedure tables
*/
#define PROC(proc, call, argtype, restype) \
@@ -703,6 +800,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
#endif
PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
+ PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
};
static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
@@ -746,24 +844,23 @@ static int max_cb_time(struct net *net)
return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
}
-static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
{
if (clp->cl_minorversion == 0) {
- char *principal = clp->cl_cred.cr_targ_princ ?
- clp->cl_cred.cr_targ_princ : "nfs";
- struct rpc_cred *cred;
-
- cred = rpc_lookup_machine_cred(principal);
- if (!IS_ERR(cred))
- get_rpccred(cred);
- return cred;
+ client->cl_principal = clp->cl_cred.cr_targ_princ ?
+ clp->cl_cred.cr_targ_princ : "nfs";
+
+ return get_cred(rpc_machine_cred());
} else {
- struct rpc_auth *auth = client->cl_auth;
- struct auth_cred acred = {};
+ struct cred *kcred;
+
+ kcred = prepare_kernel_cred(NULL);
+ if (!kcred)
+ return NULL;
- acred.uid = ses->se_cb_sec.uid;
- acred.gid = ses->se_cb_sec.gid;
- return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
+ kcred->uid = ses->se_cb_sec.uid;
+ kcred->gid = ses->se_cb_sec.gid;
+ return kcred;
}
}
@@ -786,7 +883,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_clnt *client;
- struct rpc_cred *cred;
+ const struct cred *cred;
if (clp->cl_minorversion == 0) {
if (!clp->cl_cred.cr_principal &&
@@ -1116,7 +1213,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
if (clp->cl_cb_client) {
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
- put_rpccred(clp->cl_cb_cred);
+ put_cred(clp->cl_cb_cred);
clp->cl_cb_cred = NULL;
}
if (clp->cl_cb_conn.cb_xprt) {
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a5bb76593ce7..bf137fec33ff 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,6 +65,7 @@ struct ent {
u32 id;
char name[IDMAP_NAMESZ];
char authname[IDMAP_NAMESZ];
+ struct rcu_head rcu_head;
};
/* Common entry handling */
@@ -89,7 +90,7 @@ static void
ent_put(struct kref *ref)
{
struct ent *map = container_of(ref, struct ent, h.ref);
- kfree(map);
+ kfree_rcu(map, rcu_head);
}
static struct cache_head *
@@ -264,8 +265,8 @@ out:
static struct ent *
idtoname_lookup(struct cache_detail *cd, struct ent *item)
{
- struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
- idtoname_hash(item));
+ struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+ idtoname_hash(item));
if (ch)
return container_of(ch, struct ent, h);
else
@@ -422,8 +423,8 @@ out:
static struct ent *
nametoid_lookup(struct cache_detail *cd, struct ent *item)
{
- struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
- nametoid_hash(item));
+ struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+ nametoid_hash(item));
if (ch)
return container_of(ch, struct ent, h);
else
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2b36aa037ce0..44517fb5c0de 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -656,7 +656,6 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
struct nfsd_net *nn;
ktime_t now, cutoff;
const struct nfsd4_layout_ops *ops;
- LIST_HEAD(reaplist);
switch (task->tk_status) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b7bc6e1a85ac..0cfd257ffdaf 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -36,6 +36,7 @@
#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/slab.h>
+#include <linux/kthread.h>
#include "idmap.h"
#include "cache.h"
@@ -862,8 +863,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_rename *rename = &u->rename;
__be32 status;
- if (opens_in_grace(SVC_NET(rqstp)) &&
- !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
+ if (opens_in_grace(SVC_NET(rqstp)))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
rename->rn_snamelen, &cstate->current_fh,
@@ -1015,8 +1015,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist,
&write->wr_head, write->wr_buflen);
- if (!nvecs)
- return nfserr_io;
WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
@@ -1037,6 +1035,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
+ if (!cstate->save_fh.fh_dentry)
+ return nfserr_nofilehandle;
+
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
src_stateid, RD_STATE, src, NULL);
if (status) {
@@ -1089,36 +1090,254 @@ out:
return status;
}
+void nfs4_put_copy(struct nfsd4_copy *copy)
+{
+ if (!refcount_dec_and_test(&copy->refcount))
+ return;
+ kfree(copy);
+}
+
+static bool
+check_and_set_stop_copy(struct nfsd4_copy *copy)
+{
+ bool value;
+
+ spin_lock(&copy->cp_clp->async_lock);
+ value = copy->stopped;
+ if (!copy->stopped)
+ copy->stopped = true;
+ spin_unlock(&copy->cp_clp->async_lock);
+ return value;
+}
+
+static void nfsd4_stop_copy(struct nfsd4_copy *copy)
+{
+ /* only 1 thread should stop the copy */
+ if (!check_and_set_stop_copy(copy))
+ kthread_stop(copy->copy_task);
+ nfs4_put_copy(copy);
+}
+
+static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp)
+{
+ struct nfsd4_copy *copy = NULL;
+
+ spin_lock(&clp->async_lock);
+ if (!list_empty(&clp->async_copies)) {
+ copy = list_first_entry(&clp->async_copies, struct nfsd4_copy,
+ copies);
+ refcount_inc(&copy->refcount);
+ }
+ spin_unlock(&clp->async_lock);
+ return copy;
+}
+
+void nfsd4_shutdown_copy(struct nfs4_client *clp)
+{
+ struct nfsd4_copy *copy;
+
+ while ((copy = nfsd4_get_copy(clp)) != NULL)
+ nfsd4_stop_copy(copy);
+}
+
+static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
+{
+ struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
+
+ nfs4_put_copy(copy);
+}
+
+static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ return 1;
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
+ .release = nfsd4_cb_offload_release,
+ .done = nfsd4_cb_offload_done
+};
+
+static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
+{
+ copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+ copy->cp_synchronous = sync;
+ gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
+}
+
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+{
+ ssize_t bytes_copied = 0;
+ size_t bytes_total = copy->cp_count;
+ u64 src_pos = copy->cp_src_pos;
+ u64 dst_pos = copy->cp_dst_pos;
+
+ do {
+ if (kthread_should_stop())
+ break;
+ bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos,
+ copy->file_dst, dst_pos, bytes_total);
+ if (bytes_copied <= 0)
+ break;
+ bytes_total -= bytes_copied;
+ copy->cp_res.wr_bytes_written += bytes_copied;
+ src_pos += bytes_copied;
+ dst_pos += bytes_copied;
+ } while (bytes_total > 0 && !copy->cp_synchronous);
+ return bytes_copied;
+}
+
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+{
+ __be32 status;
+ ssize_t bytes;
+
+ bytes = _nfsd_copy_file_range(copy);
+	/* For an async copy we ignore the error here; the client can
+	 * always retry to retrieve it.
+	 */
+ if (bytes < 0 && !copy->cp_res.wr_bytes_written)
+ status = nfserrno(bytes);
+ else {
+ nfsd4_init_copy_res(copy, sync);
+ status = nfs_ok;
+ }
+
+ fput(copy->file_src);
+ fput(copy->file_dst);
+ return status;
+}
+
+static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+{
+ dst->cp_src_pos = src->cp_src_pos;
+ dst->cp_dst_pos = src->cp_dst_pos;
+ dst->cp_count = src->cp_count;
+ dst->cp_synchronous = src->cp_synchronous;
+ memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
+ memcpy(&dst->fh, &src->fh, sizeof(src->fh));
+ dst->cp_clp = src->cp_clp;
+ dst->file_dst = get_file(src->file_dst);
+ dst->file_src = get_file(src->file_src);
+ memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
+}
+
+static void cleanup_async_copy(struct nfsd4_copy *copy)
+{
+ nfs4_free_cp_state(copy);
+ fput(copy->file_dst);
+ fput(copy->file_src);
+ spin_lock(&copy->cp_clp->async_lock);
+ list_del(&copy->copies);
+ spin_unlock(&copy->cp_clp->async_lock);
+ nfs4_put_copy(copy);
+}
+
+static int nfsd4_do_async_copy(void *data)
+{
+ struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+ struct nfsd4_copy *cb_copy;
+
+ copy->nfserr = nfsd4_do_copy(copy, 0);
+ cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+ if (!cb_copy)
+ goto out;
+ memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
+ cb_copy->cp_clp = copy->cp_clp;
+ cb_copy->nfserr = copy->nfserr;
+ memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
+ nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
+ &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+ nfsd4_run_cb(&cb_copy->cp_cb);
+out:
+ cleanup_async_copy(copy);
+ return 0;
+}
+
static __be32
nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_copy *copy = &u->copy;
- struct file *src, *dst;
__be32 status;
- ssize_t bytes;
+ struct nfsd4_copy *async_copy = NULL;
- status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
- &copy->cp_dst_stateid, &dst);
+ status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
+ &copy->file_src, &copy->cp_dst_stateid,
+ &copy->file_dst);
if (status)
goto out;
- bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
- dst, copy->cp_dst_pos, copy->cp_count);
+ copy->cp_clp = cstate->clp;
+ memcpy(&copy->fh, &cstate->current_fh.fh_handle,
+ sizeof(struct knfsd_fh));
+ if (!copy->cp_synchronous) {
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- if (bytes < 0)
- status = nfserrno(bytes);
- else {
- copy->cp_res.wr_bytes_written = bytes;
- copy->cp_res.wr_stable_how = NFS_UNSTABLE;
- copy->cp_synchronous = 1;
- gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+ status = nfserrno(-ENOMEM);
+ async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+ if (!async_copy)
+ goto out;
+ if (!nfs4_init_cp_state(nn, copy)) {
+ kfree(async_copy);
+ goto out;
+ }
+ refcount_set(&async_copy->refcount, 1);
+ memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid,
+ sizeof(copy->cp_stateid));
+ dup_copy_fields(copy, async_copy);
+ async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
+ async_copy, "%s", "copy thread");
+ if (IS_ERR(async_copy->copy_task))
+ goto out_err;
+ spin_lock(&async_copy->cp_clp->async_lock);
+ list_add(&async_copy->copies,
+ &async_copy->cp_clp->async_copies);
+ spin_unlock(&async_copy->cp_clp->async_lock);
+ wake_up_process(async_copy->copy_task);
status = nfs_ok;
+ } else
+ status = nfsd4_do_copy(copy, 1);
+out:
+ return status;
+out_err:
+ cleanup_async_copy(async_copy);
+ goto out;
+}
+
+struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+{
+ struct nfsd4_copy *copy;
+
+ spin_lock(&clp->async_lock);
+ list_for_each_entry(copy, &clp->async_copies, copies) {
+ if (memcmp(&copy->cp_stateid, stateid, NFS4_STATEID_SIZE))
+ continue;
+ refcount_inc(&copy->refcount);
+ spin_unlock(&clp->async_lock);
+ return copy;
}
+ spin_unlock(&clp->async_lock);
+ return NULL;
+}
+
+static __be32
+nfsd4_offload_cancel(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_offload_status *os = &u->offload_status;
+ __be32 status = 0;
+ struct nfsd4_copy *copy;
+ struct nfs4_client *clp = cstate->clp;
+
+ copy = find_async_copy(clp, &os->stateid);
+ if (copy)
+ nfsd4_stop_copy(copy);
+ else
+ status = nfserr_bad_stateid;
- fput(src);
- fput(dst);
-out:
return status;
}
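[Editor's note] The ownership rules in the async COPY code above are worth spelling out: the client's async_copies list holds one reference, every lookup (nfsd4_get_copy(), find_async_copy()) takes its own, and check_and_set_stop_copy() guarantees kthread_stop() runs at most once even if OFFLOAD_CANCEL races with client shutdown. A hedged pthread model of that one-shot stop, with illustrative types rather than the nfsd structures:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct copy_model {
	pthread_mutex_t lock;
	bool stopped;
};

/* Returns the previous value: false means this caller won the race
 * and is the one that should actually stop the worker thread. */
static bool check_and_set_stopped(struct copy_model *c)
{
	bool was;

	pthread_mutex_lock(&c->lock);
	was = c->stopped;
	c->stopped = true;
	pthread_mutex_unlock(&c->lock);
	return was;
}

int main(void)
{
	struct copy_model c = { PTHREAD_MUTEX_INITIALIZER, false };

	/* First "cancel" wins; a racing shutdown becomes a no-op. */
	printf("%d %d\n", check_and_set_stopped(&c),
	       check_and_set_stopped(&c));	/* prints "0 1" */
	return 0;
}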
@@ -1126,7 +1345,7 @@ static __be32
nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
- __be32 status = nfserr_notsupp;
+ __be32 status;
struct file *file;
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
@@ -1144,6 +1363,25 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fput(file);
return status;
}
+static __be32
+nfsd4_offload_status(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_offload_status *os = &u->offload_status;
+ __be32 status = 0;
+ struct nfsd4_copy *copy;
+ struct nfs4_client *clp = cstate->clp;
+
+ copy = find_async_copy(clp, &os->stateid);
+ if (copy) {
+ os->count = copy->cp_res.wr_bytes_written;
+ nfs4_put_copy(copy);
+ } else
+ status = nfserr_bad_stateid;
+
+ return status;
+}
static __be32
nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
@@ -2047,6 +2285,14 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1 /* cr_synchronous */) * sizeof(__be32);
}
+static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
+ struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 2 /* osr_count */ +
+ 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
+}
+
#ifdef CONFIG_NFSD_PNFS
static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
@@ -2433,25 +2679,25 @@ static const struct nfsd4_operation nfsd4_ops[] = {
/* NFSv4.2 operations */
[OP_ALLOCATE] = {
.op_func = nfsd4_allocate,
- .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_ALLOCATE",
.op_rsize_bop = nfsd4_only_status_rsize,
},
[OP_DEALLOCATE] = {
.op_func = nfsd4_deallocate,
- .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = nfsd4_only_status_rsize,
},
[OP_CLONE] = {
.op_func = nfsd4_clone,
- .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_CLONE",
.op_rsize_bop = nfsd4_only_status_rsize,
},
[OP_COPY] = {
.op_func = nfsd4_copy,
- .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_COPY",
.op_rsize_bop = nfsd4_copy_rsize,
},
@@ -2460,6 +2706,17 @@ static const struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_SEEK",
.op_rsize_bop = nfsd4_seek_rsize,
},
+ [OP_OFFLOAD_STATUS] = {
+ .op_func = nfsd4_offload_status,
+ .op_name = "OP_OFFLOAD_STATUS",
+ .op_rsize_bop = nfsd4_offload_status_rsize,
+ },
+ [OP_OFFLOAD_CANCEL] = {
+ .op_func = nfsd4_offload_cancel,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_OFFLOAD_CANCEL",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
};
/**
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c247fa1e959..5188f9f70c78 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -662,7 +662,7 @@ struct cld_net {
struct cld_upcall {
struct list_head cu_list;
struct cld_net *cu_net;
- struct task_struct *cu_task;
+ struct completion cu_done;
struct cld_msg cu_msg;
};
@@ -671,23 +671,18 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
{
int ret;
struct rpc_pipe_msg msg;
+ struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg);
memset(&msg, 0, sizeof(msg));
msg.data = cmsg;
msg.len = sizeof(*cmsg);
- /*
- * Set task state before we queue the upcall. That prevents
- * wake_up_process in the downcall from racing with schedule.
- */
- set_current_state(TASK_UNINTERRUPTIBLE);
ret = rpc_queue_upcall(pipe, &msg);
if (ret < 0) {
- set_current_state(TASK_RUNNING);
goto out;
}
- schedule();
+ wait_for_completion(&cup->cu_done);
if (msg.errno < 0)
ret = msg.errno;
@@ -754,7 +749,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
return -EFAULT;
- wake_up_process(cup->cu_task);
+ complete(&cup->cu_done);
return mlen;
}
@@ -769,7 +764,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
if (msg->errno >= 0)
return;
- wake_up_process(cup->cu_task);
+ complete(&cup->cu_done);
}
static const struct rpc_pipe_ops cld_upcall_ops = {
@@ -900,7 +895,7 @@ restart_search:
goto restart_search;
}
}
- new->cu_task = current;
+ init_completion(&new->cu_done);
new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
new->cu_net = cn;
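Switching from the wake_up_process() handshake to a completion removes the window where the downcall could fire before the upcall thread has gone to sleep: a completion latches the "done" state, so a wakeup is never lost. A userspace analog of that latching behavior, sketched with pthreads:

/* Userspace sketch of the completion pattern adopted above: the waiter
 * blocks on a flag guarded by a mutex/condvar, and the "downcall" side
 * sets the flag exactly once. */
#include <pthread.h>
#include <stdbool.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = false;
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)		/* state is latched: no lost wakeup */
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

The while loop around the wait also absorbs spurious wakeups, which is the same service the kernel's wait_for_completion() provides.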
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b0ca0efd2875..fb3c9844c82a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -238,7 +238,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
}
spin_unlock(&nn->blocked_locks_lock);
if (found)
- posix_unblock_lock(&found->nbl_lock);
+ locks_delete_block(&found->nbl_lock);
return found;
}
@@ -293,7 +293,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo)
nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
nbl_lru);
list_del_init(&nbl->nbl_lru);
- posix_unblock_lock(&nbl->nbl_lock);
+ locks_delete_block(&nbl->nbl_lock);
free_blocked_lock(nbl);
}
}
@@ -713,6 +713,36 @@ out_free:
return NULL;
}
+/*
+ * Create a unique stateid_t to represent each COPY.
+ */
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
+{
+ int new_id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&nn->s2s_cp_lock);
+ new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT);
+ spin_unlock(&nn->s2s_cp_lock);
+ idr_preload_end();
+ if (new_id < 0)
+ return 0;
+ copy->cp_stateid.si_opaque.so_id = new_id;
+ copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time;
+ copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+ return 1;
+}
+
+void nfs4_free_cp_state(struct nfsd4_copy *copy)
+{
+ struct nfsd_net *nn;
+
+ nn = net_generic(copy->cp_clp->net, nfsd_net_id);
+ spin_lock(&nn->s2s_cp_lock);
+ idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id);
+ spin_unlock(&nn->s2s_cp_lock);
+}
+
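nfs4_init_cp_state() uses idr_alloc_cyclic() so a freed stateid value is not immediately handed to the next copy, which helps clients tell stale stateids from current ones. A toy model of cyclic allocation over a fixed table (the real IDR is a radix tree; all names here are hypothetical):

/* Toy cyclic allocator modeling idr_alloc_cyclic() behavior over a
 * fixed-size table. Returns an id >= 0 or -1 when the table is full. */
#include <stddef.h>

#define MAX_IDS 64

static void *slots[MAX_IDS];
static int next_id;			/* cursor: where the next search begins */

static int alloc_cyclic(void *ptr)
{
	for (int i = 0; i < MAX_IDS; i++) {
		int id = (next_id + i) % MAX_IDS;

		if (!slots[id]) {
			slots[id] = ptr;
			next_id = id + 1;	/* resume after the last hit */
			return id;
		}
	}
	return -1;
}

static void free_id(int id)
{
	if (id >= 0 && id < MAX_IDS)
		slots[id] = NULL;
}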
static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
{
struct nfs4_stid *stid;
@@ -1827,6 +1857,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
#ifdef CONFIG_NFSD_PNFS
INIT_LIST_HEAD(&clp->cl_lo_states);
#endif
+ INIT_LIST_HEAD(&clp->async_copies);
+ spin_lock_init(&clp->async_lock);
spin_lock_init(&clp->cl_lock);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
@@ -1942,6 +1974,7 @@ __destroy_client(struct nfs4_client *clp)
}
}
nfsd4_return_all_client_layouts(clp);
+ nfsd4_shutdown_copy(clp);
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2475,7 +2508,8 @@ static bool client_has_state(struct nfs4_client *clp)
|| !list_empty(&clp->cl_lo_states)
#endif
|| !list_empty(&clp->cl_delegations)
- || !list_empty(&clp->cl_sessions);
+ || !list_empty(&clp->cl_sessions)
+ || !list_empty(&clp->async_copies);
}
__be32
@@ -4364,7 +4398,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
if (!fl)
- goto out_stid;
+ goto out_clnt_odstate;
status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
if (fl)
@@ -4389,7 +4423,6 @@ out_unlock:
vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp);
out_clnt_odstate:
put_clnt_odstate(dp->dl_clnt_odstate);
-out_stid:
nfs4_put_stid(&dp->dl_stid);
out_delegees:
put_deleg_file(fp);
@@ -4830,7 +4863,7 @@ nfs4_laundromat(struct nfsd_net *nn)
nbl = list_first_entry(&reaplist,
struct nfsd4_blocked_lock, nbl_lru);
list_del_init(&nbl->nbl_lru);
- posix_unblock_lock(&nbl->nbl_lock);
+ locks_delete_block(&nbl->nbl_lock);
free_blocked_lock(nbl);
}
out:
@@ -5079,7 +5112,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
}
static __be32
-nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags)
+nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags)
{
__be32 status;
@@ -5162,7 +5195,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
break;
case NFS4_OPEN_STID:
case NFS4_LOCK_STID:
- status = nfs4_check_olstateid(fhp, openlockstateid(s), flags);
+ status = nfs4_check_olstateid(openlockstateid(s), flags);
break;
default:
status = nfserr_bad_stateid;
@@ -6197,15 +6230,15 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
case NFS4_READ_LT:
case NFS4_READW_LT:
file_lock->fl_type = F_RDLCK;
- break;
+ break;
case NFS4_WRITE_LT:
case NFS4_WRITEW_LT:
file_lock->fl_type = F_WRLCK;
- break;
+ break;
default:
dprintk("NFSD: nfs4_lockt: bad lock type!\n");
status = nfserr_inval;
- goto out;
+ goto out;
}
lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
@@ -7161,6 +7194,8 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->close_lru);
INIT_LIST_HEAD(&nn->del_recall_lru);
spin_lock_init(&nn->client_lock);
+ spin_lock_init(&nn->s2s_cp_lock);
+ idr_init(&nn->s2s_cp_stateids);
spin_lock_init(&nn->blocked_locks_lock);
INIT_LIST_HEAD(&nn->blocked_locks_lru);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 418fa9c78186..3de42a729093 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1768,6 +1768,13 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
}
static __be32
+nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
+ struct nfsd4_offload_status *os)
+{
+ return nfsd4_decode_stateid(argp, &os->stateid);
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1873,8 +1880,8 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
[OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
+ [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
[OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4224,15 +4231,27 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
#endif /* CONFIG_NFSD_PNFS */
static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+ struct nfsd42_write_res *write, bool sync)
{
__be32 *p;
+ p = xdr_reserve_space(&resp->xdr, 4);
+ if (!p)
+ return nfserr_resource;
- p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ if (sync)
+ *p++ = cpu_to_be32(0);
+ else {
+ __be32 nfserr;
+ *p++ = cpu_to_be32(1);
+ nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
+ if (nfserr)
+ return nfserr;
+ }
+ p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
if (!p)
return nfserr_resource;
- *p++ = cpu_to_be32(0);
p = xdr_encode_hyper(p, write->wr_bytes_written);
*p++ = cpu_to_be32(write->wr_stable_how);
p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
@@ -4246,7 +4265,8 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
{
__be32 *p;
- nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+ nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
+ copy->cp_synchronous);
if (nfserr)
return nfserr;
@@ -4257,6 +4277,22 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_offload_status *os)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_hyper(p, os->count);
+ *p++ = cpu_to_be32(0);
+
+ return nfserr;
+}
+
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -4359,7 +4395,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
[OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index dbdeb9d6af03..da52b594362a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -30,6 +30,7 @@
#define TARGET_BUCKET_SIZE 64
struct nfsd_drc_bucket {
+ struct rb_root rb_head;
struct list_head lru_head;
spinlock_t cache_lock;
};
@@ -98,7 +99,7 @@ static unsigned int
nfsd_cache_size_limit(void)
{
unsigned int limit;
- unsigned long low_pages = totalram_pages - totalhigh_pages;
+ unsigned long low_pages = totalram_pages() - totalhigh_pages();
limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
return min_t(unsigned int, limit, 256*1024);
@@ -121,7 +122,7 @@ nfsd_cache_hash(__be32 xid)
}
static struct svc_cacherep *
-nfsd_reply_cache_alloc(void)
+nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
{
struct svc_cacherep *rp;
@@ -129,21 +130,35 @@ nfsd_reply_cache_alloc(void)
if (rp) {
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
+ RB_CLEAR_NODE(&rp->c_node);
INIT_LIST_HEAD(&rp->c_lru);
+
+ memset(&rp->c_key, 0, sizeof(rp->c_key));
+ rp->c_key.k_xid = rqstp->rq_xid;
+ rp->c_key.k_proc = rqstp->rq_proc;
+ rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp));
+ rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp)));
+ rp->c_key.k_prot = rqstp->rq_prot;
+ rp->c_key.k_vers = rqstp->rq_vers;
+ rp->c_key.k_len = rqstp->rq_arg.len;
+ rp->c_key.k_csum = csum;
}
return rp;
}
static void
-nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
- list_del(&rp->c_lru);
- atomic_dec(&num_drc_entries);
- drc_mem_usage -= sizeof(*rp);
+ if (rp->c_state != RC_UNUSED) {
+ rb_erase(&rp->c_node, &b->rb_head);
+ list_del(&rp->c_lru);
+ atomic_dec(&num_drc_entries);
+ drc_mem_usage -= sizeof(*rp);
+ }
kmem_cache_free(drc_slab, rp);
}
@@ -151,7 +166,7 @@ static void
nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
spin_lock(&b->cache_lock);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
spin_unlock(&b->cache_lock);
}
@@ -207,7 +222,7 @@ void nfsd_reply_cache_shutdown(void)
struct list_head *head = &drc_hashtbl[i].lru_head;
while (!list_empty(head)) {
rp = list_first_entry(head, struct svc_cacherep, c_lru);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp);
}
}
@@ -246,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
if (atomic_read(&num_drc_entries) <= max_drc_entries &&
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
freed++;
}
return freed;
@@ -318,51 +333,48 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
return csum;
}
-static bool
-nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+static int
+nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp)
{
- /* Check RPC XID first */
- if (rqstp->rq_xid != rp->c_xid)
- return false;
- /* compare checksum of NFS data */
- if (csum != rp->c_csum) {
+ if (key->c_key.k_xid == rp->c_key.k_xid &&
+ key->c_key.k_csum != rp->c_key.k_csum)
++payload_misses;
- return false;
- }
- /* Other discriminators */
- if (rqstp->rq_proc != rp->c_proc ||
- rqstp->rq_prot != rp->c_prot ||
- rqstp->rq_vers != rp->c_vers ||
- rqstp->rq_arg.len != rp->c_len ||
- !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
- rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
- return false;
-
- return true;
+ return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
}
/*
* Search the request hash for an entry that matches the given rqstp.
* Must be called with cache_lock held. Returns the found entry or
- * NULL on failure.
+ * inserts an empty key on failure.
*/
static struct svc_cacherep *
-nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
- __wsum csum)
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)
{
- struct svc_cacherep *rp, *ret = NULL;
- struct list_head *rh = &b->lru_head;
+ struct svc_cacherep *rp, *ret = key;
+ struct rb_node **p = &b->rb_head.rb_node,
+ *parent = NULL;
unsigned int entries = 0;
+ int cmp;
- list_for_each_entry(rp, rh, c_lru) {
+ while (*p != NULL) {
++entries;
- if (nfsd_cache_match(rqstp, csum, rp)) {
+ parent = *p;
+ rp = rb_entry(parent, struct svc_cacherep, c_node);
+
+ cmp = nfsd_cache_key_cmp(key, rp);
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
ret = rp;
- break;
+ goto out;
}
}
-
+ rb_link_node(&key->c_node, parent, p);
+ rb_insert_color(&key->c_node, &b->rb_head);
+out:
/* tally hash chain length stats */
if (entries > longest_chain) {
longest_chain = entries;
@@ -374,6 +386,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
atomic_read(&num_drc_entries));
}
+ lru_put_end(b, ret);
return ret;
}
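nfsd_cache_insert() folds lookup and insert into a single descent of the bucket's rb-tree, keyed by memcmp() over the fixed-layout c_key, so a miss costs no second traversal. The same shape in a plain binary search tree, as a userspace sketch with hypothetical types:

/* Userspace sketch of the lookup-or-insert descent used by
 * nfsd_cache_insert(): compare a composite key with memcmp(), walk
 * left/right, and either return the existing node or link the new one. */
#include <string.h>

struct entry {
	unsigned char key[32];		/* stand-in for the DRC key */
	struct entry *left, *right;
};

static struct entry *insert_or_find(struct entry **root, struct entry *new)
{
	struct entry **p = root;

	while (*p) {
		int cmp = memcmp(new->key, (*p)->key, sizeof(new->key));

		if (cmp < 0)
			p = &(*p)->left;
		else if (cmp > 0)
			p = &(*p)->right;
		else
			return *p;	/* hit: caller frees 'new' */
	}
	*p = new;			/* miss: new entry is now in the tree */
	return new;
}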
@@ -389,9 +402,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
{
struct svc_cacherep *rp, *found;
__be32 xid = rqstp->rq_xid;
- u32 proto = rqstp->rq_prot,
- vers = rqstp->rq_vers,
- proc = rqstp->rq_proc;
__wsum csum;
u32 hash = nfsd_cache_hash(xid);
struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
@@ -410,60 +420,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
- rp = nfsd_reply_cache_alloc();
- spin_lock(&b->cache_lock);
- if (likely(rp)) {
- atomic_inc(&num_drc_entries);
- drc_mem_usage += sizeof(*rp);
+ rp = nfsd_reply_cache_alloc(rqstp, csum);
+ if (!rp) {
+ dprintk("nfsd: unable to allocate DRC entry!\n");
+ return rtn;
}
- /* go ahead and prune the cache */
- prune_bucket(b);
-
- found = nfsd_cache_search(b, rqstp, csum);
- if (found) {
- if (likely(rp))
- nfsd_reply_cache_free_locked(rp);
+ spin_lock(&b->cache_lock);
+ found = nfsd_cache_insert(b, rp);
+ if (found != rp) {
+ nfsd_reply_cache_free_locked(NULL, rp);
rp = found;
goto found_entry;
}
- if (!rp) {
- dprintk("nfsd: unable to allocate DRC entry!\n");
- goto out;
- }
-
nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
- rp->c_xid = xid;
- rp->c_proc = proc;
- rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
- rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
- rp->c_prot = proto;
- rp->c_vers = vers;
- rp->c_len = rqstp->rq_arg.len;
- rp->c_csum = csum;
- lru_put_end(b, rp);
+ atomic_inc(&num_drc_entries);
+ drc_mem_usage += sizeof(*rp);
- /* release any buffer */
- if (rp->c_type == RC_REPLBUFF) {
- drc_mem_usage -= rp->c_replvec.iov_len;
- kfree(rp->c_replvec.iov_base);
- rp->c_replvec.iov_base = NULL;
- }
- rp->c_type = RC_NOCACHE;
+ /* go ahead and prune the cache */
+ prune_bucket(b);
out:
spin_unlock(&b->cache_lock);
return rtn;
found_entry:
- nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
- lru_put_end(b, rp);
-
+ nfsdstats.rchits++;
rtn = RC_DROPIT;
+
/* Request being processed */
if (rp->c_state == RC_INPROG)
goto out;
@@ -489,7 +477,7 @@ found_entry:
break;
default:
printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
}
goto out;
@@ -524,7 +512,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;
- hash = nfsd_cache_hash(rp->c_xid);
+ hash = nfsd_cache_hash(rp->c_key.k_xid);
b = &drc_hashtbl[hash];
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7fb9f7c667b1..72a7681f4046 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,6 +1126,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
case 'Y':
case 'y':
case '1':
+ if (nn->nfsd_serv)
+ return -EBUSY;
nfsd4_end_grace(nn);
break;
default:
@@ -1237,11 +1239,12 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
- nn->nfsd4_lease = 45; /* default lease time */
- nn->nfsd4_grace = 45;
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
nn->somebody_reclaimed = false;
nn->clverifier_counter = prandom_u32();
nn->clientid_counter = prandom_u32();
+ nn->s2s_cp_cl_id = nn->clientid_counter++;
atomic_set(&nn->ntf_refcnt, 0);
init_waitqueue_head(&nn->ntf_wq);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0b15dac7e609..396c76755b03 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -327,7 +327,7 @@ struct nfs4_client {
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
unsigned long cl_flags;
- struct rpc_cred *cl_cb_cred;
+ const struct cred *cl_cb_cred;
struct rpc_clnt *cl_cb_client;
u32 cl_cb_ident;
#define NFSD4_CB_UP 0
@@ -355,6 +355,8 @@ struct nfs4_client {
struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
/* wait here for slots */
struct net *net;
+ struct list_head async_copies; /* list of async copies */
+ spinlock_t async_lock; /* lock for async copies */
};
/* struct nfs4_client_reset
@@ -573,6 +575,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
NFSPROC4_CLNT_CB_LAYOUT,
+ NFSPROC4_CLNT_CB_OFFLOAD,
NFSPROC4_CLNT_CB_SEQUENCE,
NFSPROC4_CLNT_CB_NOTIFY_LOCK,
};
@@ -599,6 +602,7 @@ struct nfsd4_blocked_lock {
struct nfsd4_compound_state;
struct nfsd_net;
+struct nfsd4_copy;
extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
@@ -608,6 +612,8 @@ __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid **s, struct nfsd_net *nn);
struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
void (*sc_free)(struct nfs4_stid *));
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
+void nfs4_free_cp_state(struct nfsd4_copy *copy);
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
@@ -626,6 +632,7 @@ extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
+extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
@@ -633,6 +640,9 @@ extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
struct nfs4_file *find_file(struct knfsd_fh *fh);
void put_nfs4_file(struct nfs4_file *fi);
+extern void nfs4_put_copy(struct nfsd4_copy *copy);
+extern struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid);
static inline void get_nfs4_file(struct nfs4_file *fi)
{
refcount_inc(&fi->fi_ref);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b53e76391e52..7dc98e14655d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -396,10 +396,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
bool get_write_count;
bool size_change = (iap->ia_valid & ATTR_SIZE);
- if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+ if (iap->ia_valid & ATTR_SIZE) {
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
- if (iap->ia_valid & ATTR_SIZE)
ftype = S_IFREG;
+ }
+
+ /*
+ * If utimes(2) and friends are called with times not NULL, we should
+ * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission
+ * will return EACCES when the caller's effective UID does not match
+ * the owner of the file, and the caller is not privileged. In this
+ * situation, we should return EPERM (notify_change will return this).
+ */
+ if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)))
+ accmode |= NFSD_MAY_WRITE;
+ }
/* Callers that do fh_verify should do the fh_want_write: */
get_write_count = !fhp->fh_dentry;
@@ -541,8 +554,14 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
- return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
- count));
+ loff_t cloned;
+
+ cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+ if (cloned < 0)
+ return nfserrno(cloned);
+ if (count && cloned != count)
+ return nfserrno(-EINVAL);
+ return 0;
}
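The rewritten nfsd4_clone_file_range() now fails a short clone outright, since CLONE is all-or-nothing from the client's point of view. The equivalent userspace entry point is the FICLONERANGE ioctl; a hedged example (requires a reflink-capable filesystem such as btrfs or XFS with reflink enabled):

/* Userspace clone of a byte range, roughly what NFSD's CLONE maps to
 * on the server side. The operation either clones the whole range or
 * fails; there is no partial result to check for. */
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */
#include <stdio.h>

static int clone_range(int src_fd, int dst_fd, __u64 src_off,
		       __u64 dst_off, __u64 len)
{
	struct file_clone_range fcr = {
		.src_fd = src_fd,
		.src_offset = src_off,
		.src_length = len,	/* 0 means "to EOF", like count == 0 above */
		.dest_offset = dst_off,
	};

	if (ioctl(dst_fd, FICLONERANGE, &fcr) < 0) {
		perror("FICLONERANGE");
		return -1;
	}
	return 0;
}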
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
@@ -923,7 +942,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
int host_err;
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
+ iov_iter_kvec(&iter, READ, vec, vlen, *count);
host_err = vfs_iter_read(file, &iter, &offset, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
}
@@ -999,7 +1018,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (stable && !use_wgather)
flags |= RWF_SYNC;
- iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt);
+ iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
host_err = vfs_iter_write(file, &iter, &pos, flags);
if (host_err < 0)
goto out_nfserr;
@@ -1276,7 +1295,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild = NULL;
- struct inode *dirp;
__be32 err;
int host_err;
@@ -1288,7 +1306,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
dentry = fhp->fh_dentry;
- dirp = d_inode(dentry);
host_err = fh_want_write(fhp);
if (host_err)
@@ -1409,6 +1426,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*created = 1;
break;
}
+ /* fall through */
case NFS4_CREATE_EXCLUSIVE4_1:
if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
&& d_inode(dchild)->i_atime.tv_sec == v_atime
@@ -1417,7 +1435,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*created = 1;
goto set_attr;
}
- /* fallthru */
+ /* fall through */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 17c453a7999c..feeb6d4bdffd 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -511,6 +511,7 @@ struct nfsd42_write_res {
u64 wr_bytes_written;
u32 wr_stable_how;
nfs4_verifier wr_verifier;
+ stateid_t cb_stateid;
};
struct nfsd4_copy {
@@ -526,6 +527,23 @@ struct nfsd4_copy {
/* response */
struct nfsd42_write_res cp_res;
+
+ /* for cb_offload */
+ struct nfsd4_callback cp_cb;
+ __be32 nfserr;
+ struct knfsd_fh fh;
+
+ struct nfs4_client *cp_clp;
+
+ struct file *file_src;
+ struct file *file_dst;
+
+ stateid_t cp_stateid;
+
+ struct list_head copies;
+ struct task_struct *copy_task;
+ refcount_t refcount;
+ bool stopped;
};
struct nfsd4_seek {
@@ -539,6 +557,15 @@ struct nfsd4_seek {
loff_t seek_pos;
};
+struct nfsd4_offload_status {
+ /* request */
+ stateid_t stateid;
+
+ /* response */
+ u64 count;
+ u32 status;
+};
+
struct nfsd4_op {
int opnum;
const struct nfsd4_operation * opdesc;
@@ -597,6 +624,7 @@ struct nfsd4_op {
struct nfsd4_fallocate deallocate;
struct nfsd4_clone clone;
struct nfsd4_copy copy;
+ struct nfsd4_offload_status offload_status;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 517239af0302..547cf07cf4e0 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -38,3 +38,13 @@
#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define enc_cb_offload_info_sz (1 + 1 + 2 + 1 + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define NFS4_enc_cb_offload_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ enc_nfs4_fh_sz + \
+ enc_stateid_sz + \
+ enc_cb_offload_info_sz)
+#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
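The new CB_OFFLOAD sizing macros count 32-bit XDR words. Expanding enc_cb_offload_info_sz with NFS4_VERIFIER_SIZE = 8 gives 1 + 1 + 2 + 1 + 2 = 7 words, i.e. 28 bytes on the wire; a quick check:

/* Worked expansion of the CB_OFFLOAD info sizing macro, in XDR words.
 * XDR_QUADLEN(n) rounds n bytes up to 4-byte words; NFS4_VERIFIER_SIZE
 * is 8 bytes, so the verifier takes 2 words. */
#include <assert.h>

#define XDR_QUADLEN(n)		(((n) + 3) >> 2)
#define NFS4_VERIFIER_SIZE	8

#define enc_cb_offload_info_sz	(1 + 1 + 2 + 1 + \
				 XDR_QUADLEN(NFS4_VERIFIER_SIZE))

int main(void)
{
	assert(enc_cb_offload_info_sz == 7);	/* 28 bytes on the wire */
	return 0;
}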
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ebb24a314f43..f2129a5d9f23 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
ctxt->newbh = NULL;
if (inode->i_blkbits == PAGE_SHIFT) {
- lock_page(obh->b_page);
- /*
- * We cannot call radix_tree_preload for the kernels older
- * than 2.6.23, because it is not exported for modules.
- */
+ struct page *opage = obh->b_page;
+ lock_page(opage);
retry:
- err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (err)
- goto failed_unlock;
/* BUG_ON(oldkey != obh->b_page->index); */
- if (unlikely(oldkey != obh->b_page->index))
- NILFS_PAGE_BUG(obh->b_page,
+ if (unlikely(oldkey != opage->index))
+ NILFS_PAGE_BUG(opage,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
xa_lock_irq(&btnc->i_pages);
- err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
+ err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
xa_unlock_irq(&btnc->i_pages);
/*
* Note: page->index will not change to newkey until
@@ -193,7 +187,6 @@ retry:
* To protect the page in intermediate state, the page lock
* is held.
*/
- radix_tree_preload_end();
if (!err)
return 0;
else if (err != -EEXIST)
@@ -203,7 +196,7 @@ retry:
if (!err)
goto retry;
/* fallback to copy mode */
- unlock_page(obh->b_page);
+ unlock_page(opage);
}
nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
mark_buffer_dirty(obh);
xa_lock_irq(&btnc->i_pages);
- radix_tree_delete(&btnc->i_pages, oldkey);
- radix_tree_tag_set(&btnc->i_pages, newkey,
- PAGECACHE_TAG_DIRTY);
+ __xa_erase(&btnc->i_pages, oldkey);
+ __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&btnc->i_pages);
opage->index = obh->b_blocknr = newkey;
@@ -274,9 +266,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
return;
if (nbh == NULL) { /* blocksize == pagesize */
- xa_lock_irq(&btnc->i_pages);
- radix_tree_delete(&btnc->i_pages, newkey);
- xa_unlock_irq(&btnc->i_pages);
+ xa_erase_irq(&btnc->i_pages, newkey);
unlock_page(ctxt->bh->b_page);
} else
brelse(nbh);
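The nilfs2 conversion can drop radix_tree_preload() because __xa_insert() allocates its own nodes, dropping and retaking xa_lock if the gfp flags allow it. A kernel-style sketch of the resulting pattern (not standalone-runnable; condensed from the prepare/commit pair above):

/* Kernel-context sketch, assuming the XArray API of 4.20+: insert a
 * page under a new key with allocation handled by the XArray itself,
 * then retire the old key and carry the dirty mark over. */
#include <linux/xarray.h>
#include <linux/pagemap.h>

static int move_key(struct xarray *xa, unsigned long oldkey,
		    unsigned long newkey, struct page *page)
{
	int err;

	xa_lock_irq(xa);
	/* __xa_insert() may drop and retake the lock to allocate */
	err = __xa_insert(xa, newkey, page, GFP_NOFS);
	if (!err) {
		__xa_erase(xa, oldkey);
		__xa_set_mark(xa, newkey, PAGECACHE_TAG_DIRTY);
	}
	xa_unlock_irq(xa);
	return err;	/* -EBUSY if newkey was already occupied */
}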
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 329a056b73b1..d7fc8d369d89 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -289,7 +289,7 @@ repeat:
* @dmap: destination page cache
* @smap: source page cache
*
- * No pages must no be added to the cache during this process.
+ * No pages must be added to the cache during this process.
* This must be ensured by the caller.
*/
void nilfs_copy_back_pages(struct address_space *dmap,
@@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap,
struct pagevec pvec;
unsigned int i, n;
pgoff_t index = 0;
- int err;
pagevec_init(&pvec);
repeat:
@@ -313,35 +312,34 @@ repeat:
lock_page(page);
dpage = find_lock_page(dmap, offset);
if (dpage) {
- /* override existing page on the destination cache */
+ /* overwrite existing page in the destination cache */
WARN_ON(PageDirty(dpage));
nilfs_copy_page(dpage, page, 0);
unlock_page(dpage);
put_page(dpage);
+ /* Do we not need to remove page from smap here? */
} else {
- struct page *page2;
+ struct page *p;
/* move the page to the destination cache */
xa_lock_irq(&smap->i_pages);
- page2 = radix_tree_delete(&smap->i_pages, offset);
- WARN_ON(page2 != page);
-
+ p = __xa_erase(&smap->i_pages, offset);
+ WARN_ON(page != p);
smap->nrpages--;
xa_unlock_irq(&smap->i_pages);
xa_lock_irq(&dmap->i_pages);
- err = radix_tree_insert(&dmap->i_pages, offset, page);
- if (unlikely(err < 0)) {
- WARN_ON(err == -EEXIST);
+ p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
+ if (unlikely(p)) {
+ /* Probably -ENOMEM */
page->mapping = NULL;
- put_page(page); /* for cache */
+ put_page(page);
} else {
page->mapping = dmap;
dmap->nrpages++;
if (PageDirty(page))
- radix_tree_tag_set(&dmap->i_pages,
- offset,
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&dmap->i_pages, offset,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irq(&dmap->i_pages);
}
@@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page)
if (mapping) {
xa_lock_irq(&mapping->i_pages);
if (test_bit(PG_dirty, &page->flags)) {
- radix_tree_tag_clear(&mapping->i_pages,
- page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&mapping->i_pages);
return clear_page_dirty_for_io(page);
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 41355ce74ac0..735bfb2e9190 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -2,6 +2,7 @@ config FANOTIFY
bool "Filesystem wide access notification"
select FSNOTIFY
select ANON_INODES
+ select EXPORTFS
default n
---help---
Say Y here to enable fanotify support. fanotify is a file access
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 94b52157bf8d..6b9c27548997 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,22 +13,40 @@
#include <linux/wait.h>
#include <linux/audit.h>
#include <linux/sched/mm.h>
+#include <linux/statfs.h>
#include "fanotify.h"
static bool should_merge(struct fsnotify_event *old_fsn,
struct fsnotify_event *new_fsn)
{
- struct fanotify_event_info *old, *new;
+ struct fanotify_event *old, *new;
pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
- old->path.mnt == new->path.mnt &&
- old->path.dentry == new->path.dentry)
- return true;
+ if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+ old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+ return false;
+
+ if (fanotify_event_has_path(old)) {
+ return old->path.mnt == new->path.mnt &&
+ old->path.dentry == new->path.dentry;
+ } else if (fanotify_event_has_fid(old)) {
+ /*
+ * We want to merge many dirent events in the same dir (i.e.
+ * creates/unlinks/renames), but we do not want to merge dirent
+ * events referring to subdirs with dirent events referring to
+ * non subdirs, otherwise, user won't be able to tell from a
+ * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+
+ * non subdirs; otherwise, the user won't be able to tell from a
+ * mask of FAN_CREATE|FAN_DELETE|FAN_ONDIR whether it describes a
+ * mkdir+unlink pair or an rmdir+create pair of events.
+ fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+ }
+
+ /* Do not merge events if we failed to encode fid */
return false;
}
@@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn,
static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
{
struct fsnotify_event *test_event;
+ struct fanotify_event *new;
pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+ new = FANOTIFY_E(event);
/*
* Don't merge a permission event with any other event so that we know
* the event structure we have created in fanotify_handle_event() is the
* one we should check for permission response.
*/
- if (fanotify_is_perm_event(event->mask))
+ if (fanotify_is_perm_event(new->mask))
return 0;
list_for_each_entry_reverse(test_event, list, list) {
if (should_merge(test_event, event)) {
- test_event->mask |= event->mask;
+ FANOTIFY_E(test_event)->mask |= new->mask;
return 1;
}
}
@@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
return 0;
}
+/*
+ * Wait for response to permission event. The function also takes care of
+ * freeing the permission event (or offloads that in case the wait is canceled
+ * by a signal). The function returns 0 in case access got allowed by userspace,
+ * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
+ * the wait got interrupted by a signal.
+ */
static int fanotify_get_response(struct fsnotify_group *group,
- struct fanotify_perm_event_info *event,
+ struct fanotify_perm_event *event,
struct fsnotify_iter_info *iter_info)
{
int ret;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- wait_event(group->fanotify_data.access_waitq, event->response);
+ ret = wait_event_killable(group->fanotify_data.access_waitq,
+ event->state == FAN_EVENT_ANSWERED);
+ /* Signal pending? */
+ if (ret < 0) {
+ spin_lock(&group->notification_lock);
+ /* Event reported to userspace and no answer yet? */
+ if (event->state == FAN_EVENT_REPORTED) {
+ /* Event will get freed once userspace answers to it */
+ event->state = FAN_EVENT_CANCELED;
+ spin_unlock(&group->notification_lock);
+ return ret;
+ }
+ /* Event not yet reported? Just remove it. */
+ if (event->state == FAN_EVENT_INIT)
+ fsnotify_remove_queued_event(group, &event->fae.fse);
+ /*
+ * Event may be also answered in case signal delivery raced
+ * with wakeup. In that case we have nothing to do besides
+ * freeing the event and reporting error.
+ */
+ spin_unlock(&group->notification_lock);
+ goto out;
+ }
/* userspace responded, convert to something usable */
switch (event->response & ~FAN_AUDIT) {
@@ -81,19 +130,27 @@ static int fanotify_get_response(struct fsnotify_group *group,
if (event->response & FAN_AUDIT)
audit_fanotify(event->response & ~FAN_AUDIT);
- event->response = 0;
-
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
-
+out:
+ fsnotify_destroy_event(group, &event->fae.fse);
+
return ret;
}
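After the killable wait, the userspace answer collapses to a plain return code. A distilled version of that mapping (the default-deny arm is an assumption consistent with the code above, where only FAN_ALLOW grants access):

/* Sketch of the response mapping done after the wait: strip FAN_AUDIT,
 * then treat anything that is not an explicit FAN_ALLOW as a denial. */
#include <linux/fanotify.h>
#include <errno.h>

static int response_to_result(unsigned int response)
{
	switch (response & ~FAN_AUDIT) {
	case FAN_ALLOW:
		return 0;		/* access permitted */
	case FAN_DENY:
	default:
		return -EPERM;		/* access denied */
	}
}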
-static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info,
- u32 event_mask, const void *data,
- int data_type)
+/*
+ * This function returns a mask for an event that only contains the flags
+ * that have been specifically requested by the user. Flags that may have
+ * been included within the event mask, but have not been explicitly
+ * requested by the user, will not be present in the returned mask.
+ */
+static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ struct fsnotify_iter_info *iter_info,
+ u32 event_mask, const void *data,
+ int data_type)
{
__u32 marks_mask = 0, marks_ignored_mask = 0;
+ __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
const struct path *path = data;
struct fsnotify_mark *mark;
int type;
@@ -101,49 +158,132 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- /* if we don't have enough info to send an event to userspace say no */
- if (data_type != FSNOTIFY_EVENT_PATH)
- return false;
-
- /* sorry, fanotify only gives a damn about files and dirs */
- if (!d_is_reg(path->dentry) &&
- !d_can_lookup(path->dentry))
- return false;
+ if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do we have path to open a file descriptor? */
+ if (data_type != FSNOTIFY_EVENT_PATH)
+ return 0;
+ /* Path type events are only relevant for files and dirs */
+ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
+ return 0;
+ }
fsnotify_foreach_obj_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
/*
- * if the event is for a child and this inode doesn't care about
- * events on the child, don't send it!
+ * If the event is for a child and this mark doesn't care about
+ * events on a child, don't send it!
*/
- if (type == FSNOTIFY_OBJ_TYPE_INODE &&
- (event_mask & FS_EVENT_ON_CHILD) &&
- !(mark->mask & FS_EVENT_ON_CHILD))
+ if (event_mask & FS_EVENT_ON_CHILD &&
+ (type != FSNOTIFY_OBJ_TYPE_INODE ||
+ !(mark->mask & FS_EVENT_ON_CHILD)))
continue;
marks_mask |= mark->mask;
marks_ignored_mask |= mark->ignored_mask;
}
- if (d_is_dir(path->dentry) &&
+ test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+
+ /*
+ * dirent modification events (create/delete/move) do not carry the
+ * child entry name/inode information. Instead, we report FAN_ONDIR
+ * for mkdir/rmdir so user can differentiate them from creat/unlink.
+ *
+ * For backward compatibility and consistency, do not report FAN_ONDIR
+ * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
+ * to user in FAN_REPORT_FID mode for all event types.
+ */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do not report FAN_ONDIR without any event */
+ if (!(test_mask & ~FAN_ONDIR))
+ return 0;
+ } else {
+ user_mask &= ~FAN_ONDIR;
+ }
+
+ if (event_mask & FS_ISDIR &&
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
- return false;
+ return 0;
- if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
- ~marks_ignored_mask)
- return true;
+ return test_mask & user_mask;
+}
- return false;
+static int fanotify_encode_fid(struct fanotify_event *event,
+ struct inode *inode, gfp_t gfp,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_fid *fid = &event->fid;
+ int dwords, bytes = 0;
+ int err, type;
+
+ fid->ext_fh = NULL;
+ dwords = 0;
+ err = -ENOENT;
+ type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ if (!dwords)
+ goto out_err;
+
+ bytes = dwords << 2;
+ if (bytes > FANOTIFY_INLINE_FH_LEN) {
+ /* Treat failure to allocate fh as failure to allocate event */
+ err = -ENOMEM;
+ fid->ext_fh = kmalloc(bytes, gfp);
+ if (!fid->ext_fh)
+ goto out_err;
+ }
+
+ type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
+ &dwords, NULL);
+ err = -EINVAL;
+ if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+ goto out_err;
+
+ fid->fsid = *fsid;
+ event->fh_len = bytes;
+
+ return type;
+
+out_err:
+ pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+ "type=%d, bytes=%d, err=%i)\n",
+ fsid->val[0], fsid->val[1], type, bytes, err);
+ kfree(fid->ext_fh);
+ fid->ext_fh = NULL;
+ event->fh_len = 0;
+
+ return FILEID_INVALID;
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path)
+/*
+ * The inode to use as identifier when reporting fid depends on the event.
+ * Report the modified directory inode on dirent modification events.
+ * Report the "victim" inode otherwise.
+ * For example:
+ * FS_ATTRIB reports the child inode even if reported on a watched parent.
+ * FS_CREATE reports the modified dir inode and not the created inode.
+ */
+static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
+ const void *data, int data_type)
{
- struct fanotify_event_info *event = NULL;
+ if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ return to_tell;
+ else if (data_type == FSNOTIFY_EVENT_INODE)
+ return (struct inode *)data;
+ else if (data_type == FSNOTIFY_EVENT_PATH)
+ return d_inode(((struct path *)data)->dentry);
+ return NULL;
+}
+
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_event *event = NULL;
gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
/*
* For queues with unlimited length lost events are not expected and
@@ -157,25 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
memalloc_use_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
- struct fanotify_perm_event_info *pevent;
+ struct fanotify_perm_event *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
goto out;
event = &pevent->fae;
pevent->response = 0;
+ pevent->state = FAN_EVENT_INIT;
goto init;
}
event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
goto out;
init: __maybe_unused
- fsnotify_init_event(&event->fse, inode, mask);
- event->tgid = get_pid(task_tgid(current));
- if (path) {
- event->path = *path;
+ fsnotify_init_event(&event->fse, inode);
+ event->mask = mask;
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
+ event->pid = get_pid(task_pid(current));
+ else
+ event->pid = get_pid(task_tgid(current));
+ event->fh_len = 0;
+ if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Report the event without a file identifier on encode error */
+ event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
+ } else if (data_type == FSNOTIFY_EVENT_PATH) {
+ event->fh_type = FILEID_ROOT;
+ event->path = *((struct path *)data);
path_get(&event->path);
} else {
+ event->fh_type = FILEID_INVALID;
event->path.mnt = NULL;
event->path.dentry = NULL;
}
@@ -184,6 +335,29 @@ out:
return event;
}
+/*
+ * Get cached fsid of the filesystem containing the object from any connector.
+ * All connectors are supposed to have the same fsid, but we do not verify that
+ * here.
+ */
+static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+{
+ int type;
+ __kernel_fsid_t fsid = {};
+
+ fsnotify_foreach_obj_type(type) {
+ if (!fsnotify_iter_should_report_type(iter_info, type))
+ continue;
+
+ fsid = iter_info->marks[type]->connector->fsid;
+ if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+ continue;
+ return fsid;
+ }
+
+ return fsid;
+}
+
static int fanotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
u32 mask, const void *data, int data_type,
@@ -191,21 +365,35 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct fsnotify_iter_info *iter_info)
{
int ret = 0;
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
struct fsnotify_event *fsn_event;
+ __kernel_fsid_t fsid = {};
BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+ BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+ BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+ BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+ BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+ BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+ BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+ BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
+ BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
- if (!fanotify_should_send_event(iter_info, mask, data, data_type))
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+
+ mask = fanotify_group_event_mask(group, iter_info, mask, data,
+ data_type);
+ if (!mask)
return 0;
pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
@@ -220,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- event = fanotify_alloc_event(group, inode, mask, data);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+ fsid = fanotify_get_fsid(iter_info);
+
+ event = fanotify_alloc_event(group, inode, mask, data, data_type,
+ &fsid);
ret = -ENOMEM;
if (unlikely(!event)) {
/*
@@ -236,7 +428,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
if (ret) {
/* Permission events shouldn't be merged */
- BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
+ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
@@ -244,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group,
} else if (fanotify_is_perm_event(mask)) {
ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
iter_info);
- fsnotify_destroy_event(group, fsn_event);
}
finish:
if (fanotify_is_perm_event(mask))
@@ -264,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
static void fanotify_free_event(struct fsnotify_event *fsn_event)
{
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
event = FANOTIFY_E(fsn_event);
- path_put(&event->path);
- put_pid(event->tgid);
- if (fanotify_is_perm_event(fsn_event->mask)) {
+ if (fanotify_event_has_path(event))
+ path_put(&event->path);
+ else if (fanotify_event_has_ext_fh(event))
+ kfree(event->fid.ext_fh);
+ put_pid(event->pid);
+ if (fanotify_is_perm_event(event->mask)) {
kmem_cache_free(fanotify_perm_event_cachep,
FANOTIFY_PE(fsn_event));
return;
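With FAN_REPORT_FID, the event carries an fsid plus an opaque file handle instead of an open fd, and userspace can resolve the handle later with open_by_handle_at(2). A minimal consumer sketch (assumes Linux 5.1+ headers and CAP_SYS_ADMIN; error handling abbreviated):

/* Minimal FAN_REPORT_FID consumer: watch /tmp for creates/deletes and
 * resolve each reported handle back to an fd. For dirent events the
 * handle identifies the modified parent directory, per the comment on
 * fanotify_fid_inode() above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int mount_fd = open("/tmp", O_RDONLY | O_DIRECTORY);
	int fan = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

	if (fan < 0 || mount_fd < 0)
		return 1;
	if (fanotify_mark(fan, FAN_MARK_ADD,
			  FAN_CREATE | FAN_DELETE | FAN_ONDIR,
			  AT_FDCWD, "/tmp"))
		return 1;

	len = read(fan, buf, sizeof(buf));
	for (struct fanotify_event_metadata *md = (void *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		struct fanotify_event_info_fid *fid = (void *)(md + 1);
		struct file_handle *fh = (void *)fid->handle;
		int fd;

		if (md->event_len <= sizeof(*md))
			continue;	/* e.g. queue overflow: no fid record */
		if (fid->hdr.info_type != FAN_EVENT_INFO_TYPE_FID)
			continue;
		printf("mask 0x%llx fsid %x.%x\n",
		       (unsigned long long)md->mask,
		       fid->fsid.val[0], fid->fsid.val[1]);
		fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
		if (fd >= 0)
			close(fd);	/* ESTALE here means already deleted */
	}
	return 0;
}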
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 8609ba06f474..68b30504284c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,26 +2,112 @@
#include <linux/fsnotify_backend.h>
#include <linux/path.h>
#include <linux/slab.h>
+#include <linux/exportfs.h>
extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+/* Possible states of the permission event */
+enum {
+ FAN_EVENT_INIT,
+ FAN_EVENT_REPORTED,
+ FAN_EVENT_ANSWERED,
+ FAN_EVENT_CANCELED,
+};
+
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN (3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN (4 << 2)
+#endif
+
+struct fanotify_fid {
+ __kernel_fsid_t fsid;
+ union {
+ unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+ unsigned char *ext_fh;
+ };
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+ unsigned int fh_len)
+{
+ return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+ struct fanotify_fid *fid2,
+ unsigned int fh_len)
+{
+ return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+ fid1->fsid.val[1] == fid2->fsid.val[1] &&
+ !memcmp(fanotify_fid_fh(fid1, fh_len),
+ fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
/*
* Structure for normal fanotify events. It gets allocated in
* fanotify_handle_event() and freed when the information is retrieved by
* userspace
*/
-struct fanotify_event_info {
+struct fanotify_event {
struct fsnotify_event fse;
+ u32 mask;
/*
- * We hold ref to this path so it may be dereferenced at any point
- * during this object's lifetime
+ * Those fields are outside fanotify_fid to pack fanotify_event nicely
+ * on 64bit arch and to use fh_type as an indication of whether path
+ * or fid are used in the union:
+ * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
*/
- struct path path;
- struct pid *tgid;
+ u8 fh_type;
+ u8 fh_len;
+ u16 pad;
+ union {
+ /*
+ * We hold ref to this path so it may be dereferenced at any
+ * point during this object's lifetime
+ */
+ struct path path;
+ /*
+ * With FAN_REPORT_FID, we do not hold any reference on the
+ * victim object. Instead we store its NFS file handle and its
+ * filesystem's fsid as a unique identifier.
+ */
+ struct fanotify_fid fid;
+ };
+ struct pid *pid;
};
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+ return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+ return event->fh_type != FILEID_ROOT &&
+ event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+ return fanotify_event_has_fid(event) &&
+ event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+ return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -29,29 +115,31 @@ struct fanotify_event_info {
* group->notification_list to group->fanotify_data.access_list to wait for
* user response.
*/
-struct fanotify_perm_event_info {
- struct fanotify_event_info fae;
- int response; /* userspace answer to question */
+struct fanotify_perm_event {
+ struct fanotify_event fae;
+ unsigned short response; /* userspace answer to the event */
+ unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
};
-static inline struct fanotify_perm_event_info *
+static inline struct fanotify_perm_event *
FANOTIFY_PE(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+ return container_of(fse, struct fanotify_perm_event, fae.fse);
}
static inline bool fanotify_is_perm_event(u32 mask)
{
return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) &&
- mask & FAN_ALL_PERM_EVENTS;
+ mask & FANOTIFY_PERM_EVENTS;
}
-static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_event_info, fse);
+ return container_of(fse, struct fanotify_event, fse);
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path);
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid);
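struct fanotify_fid above stores short handles inline and spills longer ones to a heap buffer, discriminated by fh_len alone so no extra flag is needed. The same small-buffer optimization in isolation, as a userspace sketch with hypothetical names and sizes:

/* Userspace sketch of the inline-or-external buffer trick used by
 * struct fanotify_fid: short blobs live in the struct itself, longer
 * ones in a heap allocation, selected by length alone. */
#include <stdlib.h>
#include <string.h>

#define INLINE_LEN 16

struct blob {
	unsigned int len;
	union {
		unsigned char inline_buf[INLINE_LEN];
		unsigned char *ext_buf;
	};
};

static unsigned char *blob_data(struct blob *b)
{
	return b->len <= INLINE_LEN ? b->inline_buf : b->ext_buf;
}

static int blob_set(struct blob *b, const void *data, unsigned int len)
{
	if (len > INLINE_LEN) {
		b->ext_buf = malloc(len);
		if (!b->ext_buf)
			return -1;
	}
	b->len = len;
	memcpy(blob_data(b), data, len);
	return 0;
}

static void blob_free(struct blob *b)
{
	if (b->len > INLINE_LEN)
		free(b->ext_buf);
	b->len = 0;
}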
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 69054886915b..56992b32c6bb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
#include <asm/ioctls.h>
@@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+ if (!fanotify_event_has_fid(event))
+ return 0;
+
+ return roundup(sizeof(struct fanotify_event_info_fid) +
+ sizeof(struct file_handle) + event->fh_len,
+ FANOTIFY_EVENT_ALIGN);
+}
+
/*
* Get an fsnotify notification event if one exists and is small
* enough to fit in "count". Return an error pointer if the count
- * is not large enough.
- *
- * Called with the group->notification_lock held.
+ * is not large enough. When a permission event is dequeued, its state is
+ * updated accordingly.
*/
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- assert_spin_locked(&group->notification_lock);
+ size_t event_size = FAN_EVENT_METADATA_LEN;
+ struct fsnotify_event *fsn_event = NULL;
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+ spin_lock(&group->notification_lock);
if (fsnotify_notify_queue_is_empty(group))
- return NULL;
+ goto out;
- if (FAN_EVENT_METADATA_LEN > count)
- return ERR_PTR(-EINVAL);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ event_size += fanotify_event_info_len(
+ FANOTIFY_E(fsnotify_peek_first_event(group)));
+ }
- /* held the notification_lock the whole time, so this is the
- * same event we peeked above */
- return fsnotify_remove_first_event(group);
+ if (event_size > count) {
+ fsn_event = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ fsn_event = fsnotify_remove_first_event(group);
+ if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
+ FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+out:
+ spin_unlock(&group->notification_lock);
+ return fsn_event;
}
static int create_fd(struct fsnotify_group *group,
- struct fanotify_event_info *event,
+ struct fanotify_event *event,
struct file **file)
{
int client_fd;
@@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group,
return client_fd;
}
-static int fill_event_metadata(struct fsnotify_group *group,
- struct fanotify_event_metadata *metadata,
- struct fsnotify_event *fsn_event,
- struct file **file)
-{
- int ret = 0;
- struct fanotify_event_info *event;
-
- pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
- group, metadata, fsn_event);
-
- *file = NULL;
- event = container_of(fsn_event, struct fanotify_event_info, fse);
- metadata->event_len = FAN_EVENT_METADATA_LEN;
- metadata->metadata_len = FAN_EVENT_METADATA_LEN;
- metadata->vers = FANOTIFY_METADATA_VERSION;
- metadata->reserved = 0;
- metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
- metadata->pid = pid_vnr(event->tgid);
- if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
- metadata->fd = FAN_NOFD;
- else {
- metadata->fd = create_fd(group, event, file);
- if (metadata->fd < 0)
- ret = metadata->fd;
- }
-
- return ret;
-}
-
-static struct fanotify_perm_event_info *dequeue_event(
- struct fsnotify_group *group, int fd)
+/*
+ * Finish processing of permission event by setting it to ANSWERED state and
+ * drop group->notification_lock.
+ */
+static void finish_permission_event(struct fsnotify_group *group,
+ struct fanotify_perm_event *event,
+ unsigned int response)
+ __releases(&group->notification_lock)
{
- struct fanotify_perm_event_info *event, *return_e = NULL;
-
- spin_lock(&group->notification_lock);
- list_for_each_entry(event, &group->fanotify_data.access_list,
- fae.fse.list) {
- if (event->fd != fd)
- continue;
+ bool destroy = false;
- list_del_init(&event->fae.fse.list);
- return_e = event;
- break;
- }
+ assert_spin_locked(&group->notification_lock);
+ event->response = response;
+ if (event->state == FAN_EVENT_CANCELED)
+ destroy = true;
+ else
+ event->state = FAN_EVENT_ANSWERED;
spin_unlock(&group->notification_lock);
-
- pr_debug("%s: found return_re=%p\n", __func__, return_e);
-
- return return_e;
+ if (destroy)
+ fsnotify_destroy_event(group, &event->fae.fse);
}
static int process_access_response(struct fsnotify_group *group,
struct fanotify_response *response_struct)
{
- struct fanotify_perm_event_info *event;
+ struct fanotify_perm_event *event;
int fd = response_struct->fd;
int response = response_struct->response;
@@ -191,45 +185,118 @@ static int process_access_response(struct fsnotify_group *group,
if (fd < 0)
return -EINVAL;
- if ((response & FAN_AUDIT) && !group->fanotify_data.audit)
+ if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
return -EINVAL;
- event = dequeue_event(group, fd);
- if (!event)
- return -ENOENT;
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (event->fd != fd)
+ continue;
- event->response = response;
- wake_up(&group->fanotify_data.access_waitq);
+ list_del_init(&event->fae.fse.list);
+ finish_permission_event(group, event, response);
+ wake_up(&group->fanotify_data.access_waitq);
+ return 0;
+ }
+ spin_unlock(&group->notification_lock);
- return 0;
+ return -ENOENT;
}
-static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *event,
- char __user *buf)
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
{
- struct fanotify_event_metadata fanotify_event_metadata;
- struct file *f;
- int fd, ret;
+ struct fanotify_event_info_fid info = { };
+ struct file_handle handle = { };
+ size_t fh_len = event->fh_len;
+ size_t len = fanotify_event_info_len(event);
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+ if (!len)
+ return 0;
- ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
- if (ret < 0)
- return ret;
+ if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+ return -EFAULT;
+
+ /* Copy event info fid header followed by variable sized file handle */
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+ info.hdr.len = len;
+ info.fsid = event->fid.fsid;
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ buf += sizeof(info);
+ len -= sizeof(info);
+ handle.handle_type = event->fh_type;
+ handle.handle_bytes = fh_len;
+ if (copy_to_user(buf, &handle, sizeof(handle)))
+ return -EFAULT;
+
+ buf += sizeof(handle);
+ len -= sizeof(handle);
+ if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ return -EFAULT;
+
+ /* Pad with 0's */
+ buf += fh_len;
+ len -= fh_len;
+ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+ if (len > 0 && clear_user(buf, len))
+ return -EFAULT;
+
+ return 0;
+}
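On the receiving side, the record laid out by copy_fid_to_user() trails the fixed-size metadata whenever event_len exceeds FAN_EVENT_METADATA_LEN. A hedged parsing sketch using the uapi structure names (printf fields illustrative):

	#include <stdio.h>
	#include <fcntl.h>		/* struct file_handle */
	#include <sys/fanotify.h>

	/* Sketch: locate and dump the fid info record after the metadata. */
	static void dump_fid(struct fanotify_event_metadata *md)
	{
		struct fanotify_event_info_fid *fid;
		struct file_handle *fh;

		if (md->event_len <= FAN_EVENT_METADATA_LEN)
			return;	/* no trailing info record */
		fid = (struct fanotify_event_info_fid *)(md + 1);
		if (fid->hdr.info_type != FAN_EVENT_INFO_TYPE_FID)
			return;
		/* The handle bytes are laid out as a struct file_handle,
		 * comparable with name_to_handle_at() output. */
		fh = (struct file_handle *)fid->handle;
		printf("fsid=%x.%x handle_bytes=%u\n",
		       fid->fsid.val[0], fid->fsid.val[1], fh->handle_bytes);
	}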
+
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event,
+ char __user *buf, size_t count)
+{
+ struct fanotify_event_metadata metadata;
+ struct fanotify_event *event;
+ struct file *f = NULL;
+ int ret, fd = FAN_NOFD;
+
+ pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+
+ event = container_of(fsn_event, struct fanotify_event, fse);
+ metadata.event_len = FAN_EVENT_METADATA_LEN;
+ metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+ metadata.vers = FANOTIFY_METADATA_VERSION;
+ metadata.reserved = 0;
+ metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+ metadata.pid = pid_vnr(event->pid);
+
+ if (fanotify_event_has_path(event)) {
+ fd = create_fd(group, event, &f);
+ if (fd < 0)
+ return fd;
+ } else if (fanotify_event_has_fid(event)) {
+ metadata.event_len += fanotify_event_info_len(event);
+ }
+ metadata.fd = fd;
- fd = fanotify_event_metadata.fd;
ret = -EFAULT;
- if (copy_to_user(buf, &fanotify_event_metadata,
- fanotify_event_metadata.event_len))
+ /*
+ * Sanity check copy size in case get_one_event() and
+ * fill_event_metadata() event_len sizes ever get out of sync.
+ */
+ if (WARN_ON_ONCE(metadata.event_len > count))
+ goto out_close_fd;
+
+ if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
goto out_close_fd;
if (fanotify_is_perm_event(event->mask))
- FANOTIFY_PE(event)->fd = fd;
+ FANOTIFY_PE(fsn_event)->fd = fd;
- if (fd != FAN_NOFD)
+ if (fanotify_event_has_path(event)) {
fd_install(fd, f);
- return fanotify_event_metadata.event_len;
+ } else if (fanotify_event_has_fid(event)) {
+ ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+ if (ret < 0)
+ return ret;
+ }
+
+ return metadata.event_len;
out_close_fd:
if (fd != FAN_NOFD) {
@@ -270,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- spin_lock(&group->notification_lock);
kevent = get_one_event(group, count);
- spin_unlock(&group->notification_lock);
-
if (IS_ERR(kevent)) {
ret = PTR_ERR(kevent);
break;
@@ -295,7 +359,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
continue;
}
- ret = copy_event_to_user(group, kevent, buf);
+ ret = copy_event_to_user(group, kevent, buf, count);
if (unlikely(ret == -EOPENSTALE)) {
/*
* We cannot report events with stale fd so drop it.
@@ -310,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
- if (!fanotify_is_perm_event(kevent->mask)) {
+ if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
fsnotify_destroy_event(group, kevent);
} else {
if (ret <= 0) {
- FANOTIFY_PE(kevent)->response = FAN_DENY;
+ spin_lock(&group->notification_lock);
+ finish_permission_event(group,
+ FANOTIFY_PE(kevent), FAN_DENY);
wake_up(&group->fanotify_data.access_waitq);
} else {
spin_lock(&group->notification_lock);
@@ -364,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
- struct fanotify_perm_event_info *event, *next;
+ struct fanotify_perm_event *event;
struct fsnotify_event *fsn_event;
/*
@@ -379,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* and simulate reply from userspace.
*/
spin_lock(&group->notification_lock);
- list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
- fae.fse.list) {
- pr_debug("%s: found group=%p event=%p\n", __func__, group,
- event);
-
+ while (!list_empty(&group->fanotify_data.access_list)) {
+ event = list_first_entry(&group->fanotify_data.access_list,
+ struct fanotify_perm_event, fae.fse.list);
list_del_init(&event->fae.fse.list);
- event->response = FAN_ALLOW;
+ finish_permission_event(group, event, FAN_ALLOW);
+ spin_lock(&group->notification_lock);
}
/*
@@ -395,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
while (!fsnotify_notify_queue_is_empty(group)) {
fsn_event = fsnotify_remove_first_event(group);
- if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
+ if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, fsn_event);
- spin_lock(&group->notification_lock);
} else {
- FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+ finish_permission_event(group, FANOTIFY_PE(fsn_event),
+ FAN_ALLOW);
}
+ spin_lock(&group->notification_lock);
}
spin_unlock(&group->notification_lock);
@@ -506,18 +572,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
spin_lock(&fsn_mark->lock);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- __u32 tmask = fsn_mark->mask & ~mask;
-
- if (flags & FAN_MARK_ONDIR)
- tmask &= ~FAN_ONDIR;
-
oldmask = fsn_mark->mask;
- fsn_mark->mask = tmask;
+ fsn_mark->mask &= ~mask;
} else {
- __u32 tmask = fsn_mark->ignored_mask & ~mask;
- if (flags & FAN_MARK_ONDIR)
- tmask &= ~FAN_ONDIR;
- fsn_mark->ignored_mask = tmask;
+ fsn_mark->ignored_mask &= ~mask;
}
*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
spin_unlock(&fsn_mark->lock);
@@ -563,6 +621,13 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
mask, flags);
}
+static int fanotify_remove_sb_mark(struct fsnotify_group *group,
+ struct super_block *sb, __u32 mask,
+ unsigned int flags)
+{
+ return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags);
+}
+
static int fanotify_remove_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
unsigned int flags)
@@ -579,19 +644,10 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
spin_lock(&fsn_mark->lock);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- __u32 tmask = fsn_mark->mask | mask;
-
- if (flags & FAN_MARK_ONDIR)
- tmask |= FAN_ONDIR;
-
oldmask = fsn_mark->mask;
- fsn_mark->mask = tmask;
+ fsn_mark->mask |= mask;
} else {
- __u32 tmask = fsn_mark->ignored_mask | mask;
- if (flags & FAN_MARK_ONDIR)
- tmask |= FAN_ONDIR;
-
- fsn_mark->ignored_mask = tmask;
+ fsn_mark->ignored_mask |= mask;
if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
}
@@ -602,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *mark;
int ret;
@@ -615,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
return ERR_PTR(-ENOMEM);
fsnotify_init_mark(mark, group);
- ret = fsnotify_add_mark_locked(mark, connp, type, 0);
+ ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
return ERR_PTR(ret);
@@ -627,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int type,
- __u32 mask, unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
@@ -635,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, type);
+ fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
if (IS_ERR(fsn_mark)) {
mutex_unlock(&group->mark_mutex);
return PTR_ERR(fsn_mark);
@@ -652,15 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group,
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
+}
+
+static int fanotify_add_sb_mark(struct fsnotify_group *group,
+ struct super_block *sb, __u32 mask,
+ unsigned int flags, __kernel_fsid_t *fsid)
+{
+ return fanotify_add_mark(group, &sb->s_fsnotify_marks,
+ FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -671,11 +737,11 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
*/
if ((flags & FAN_MARK_IGNORED_MASK) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
- (atomic_read(&inode->i_writecount) > 0))
+ inode_is_open_for_write(inode))
return 0;
return fanotify_add_mark(group, &inode->i_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_INODE, mask, flags);
+ FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}
/* fanotify syscalls */
@@ -684,18 +750,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
struct fsnotify_group *group;
int f_flags, fd;
struct user_struct *user;
- struct fanotify_event_info *oevent;
+ struct fanotify_event *oevent;
- pr_debug("%s: flags=%d event_f_flags=%d\n",
- __func__, flags, event_f_flags);
+ pr_debug("%s: flags=%x event_f_flags=%x\n",
+ __func__, flags, event_f_flags);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
#ifdef CONFIG_AUDITSYSCALL
- if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT))
+ if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
- if (flags & ~FAN_ALL_INIT_FLAGS)
+ if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
return -EINVAL;
@@ -711,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
return -EINVAL;
}
+ if ((flags & FAN_REPORT_FID) &&
+ (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
+ return -EINVAL;
+
user = get_current_user();
if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
free_uid(user);
@@ -731,10 +801,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
}
group->fanotify_data.user = user;
+ group->fanotify_data.flags = flags;
atomic_inc(&user->fanotify_listeners);
group->memcg = get_mem_cgroup_from_mm(current->mm);
- oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
+ oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
+ FSNOTIFY_EVENT_NONE, NULL);
if (unlikely(!oevent)) {
fd = -ENOMEM;
goto out_destroy_group;
@@ -746,7 +818,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.f_flags = event_f_flags;
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
- switch (flags & FAN_ALL_CLASS_BITS) {
+ switch (flags & FANOTIFY_CLASS_BITS) {
case FAN_CLASS_NOTIF:
group->priority = FS_PRIO_0;
break;
@@ -783,7 +855,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
fd = -EPERM;
if (!capable(CAP_AUDIT_WRITE))
goto out_destroy_group;
- group->fanotify_data.audit = true;
}
fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
@@ -797,6 +868,48 @@ out_destroy_group:
return fd;
}
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+{
+ __kernel_fsid_t root_fsid;
+ int err;
+
+ /*
+ * Make sure path is not on a filesystem with a zero fsid (e.g. tmpfs).
+ */
+ err = vfs_get_fsid(path->dentry, fsid);
+ if (err)
+ return err;
+
+ if (!fsid->val[0] && !fsid->val[1])
+ return -ENODEV;
+
+ /*
+ * Make sure path is not inside a filesystem subvolume (e.g. btrfs),
+ * which uses a different fsid than the sb root.
+ */
+ err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ if (err)
+ return err;
+
+ if (root_fsid.val[0] != fsid->val[0] ||
+ root_fsid.val[1] != fsid->val[1])
+ return -EXDEV;
+
+ /*
+ * We need to make sure that the file system supports at least
+ * encoding a file handle so that users can use name_to_handle_at() to
+ * compare the fid returned with an event to the file handle of watched
+ * objects. However, name_to_handle_at() requires that the
+ * filesystem also supports decoding file handles.
+ */
+ if (!path->dentry->d_sb->s_export_op ||
+ !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
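Putting the new pieces together, a hedged usage sketch of an fid-reporting group with a filesystem-wide mark (path and event mask arbitrary; note FAN_REPORT_FID requires FAN_CLASS_NOTIF per the fanotify_init() check above):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/fanotify.h>

	static int watch_filesystem(const char *path)
	{
		/* fid-reporting group: events carry file handles, not fds */
		int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

		if (fd < 0)
			return -1;
		/* one superblock mark covers every object on the filesystem */
		if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
				  FAN_OPEN | FAN_CLOSE_WRITE, AT_FDCWD,
				  path) < 0) {
			close(fd);
			return -1;
		}
		return fd;
	}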
+
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
@@ -805,7 +918,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
- u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD;
+ __kernel_fsid_t __fsid, *fsid = NULL;
+ u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
int ret;
pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
@@ -815,8 +930,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & ((__u64)0xffffffff << 32))
return -EINVAL;
- if (flags & ~FAN_ALL_MARK_FLAGS)
+ if (flags & ~FANOTIFY_MARK_FLAGS)
+ return -EINVAL;
+
+ switch (mark_type) {
+ case FAN_MARK_INODE:
+ case FAN_MARK_MOUNT:
+ case FAN_MARK_FILESYSTEM:
+ break;
+ default:
return -EINVAL;
+ }
+
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
case FAN_MARK_ADD: /* fallthrough */
case FAN_MARK_REMOVE:
@@ -824,20 +949,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
break;
case FAN_MARK_FLUSH:
- if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH))
+ if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
return -EINVAL;
break;
default:
return -EINVAL;
}
- if (mask & FAN_ONDIR) {
- flags |= FAN_MARK_ONDIR;
- mask &= ~FAN_ONDIR;
- }
-
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
- valid_mask |= FAN_ALL_PERM_EVENTS;
+ valid_mask |= FANOTIFY_PERM_EVENTS;
if (mask & ~valid_mask)
return -EINVAL;
@@ -857,14 +977,28 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* allowed to set permissions events.
*/
ret = -EINVAL;
- if (mask & FAN_ALL_PERM_EVENTS &&
+ if (mask & FANOTIFY_PERM_EVENTS &&
group->priority == FS_PRIO_0)
goto fput_and_out;
+ /*
+ * Events with data type inode do not carry enough information to report
+ * event->fd, so we do not allow setting a mask for inode events unless
+ * the group supports reporting fid.
+ * Inode events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. a path) to be filtered by mount point.
+ */
+ if (mask & FANOTIFY_INODE_EVENTS &&
+ (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+ mark_type == FAN_MARK_MOUNT))
+ goto fput_and_out;
+
if (flags & FAN_MARK_FLUSH) {
ret = 0;
- if (flags & FAN_MARK_MOUNT)
+ if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
+ else if (mark_type == FAN_MARK_FILESYSTEM)
+ fsnotify_clear_sb_marks_by_group(group);
else
fsnotify_clear_inode_marks_by_group(group);
goto fput_and_out;
@@ -874,8 +1008,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto fput_and_out;
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ ret = fanotify_test_fid(&path, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ fsid = &__fsid;
+ }
+
/* inode held in place by reference to path; group by fget on fd */
- if (!(flags & FAN_MARK_MOUNT))
+ if (mark_type == FAN_MARK_INODE)
inode = path.dentry->d_inode;
else
mnt = path.mnt;
@@ -883,21 +1025,32 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
/* create/update an inode mark */
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
- if (flags & FAN_MARK_MOUNT)
- ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+ if (mark_type == FAN_MARK_MOUNT)
+ ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+ flags, fsid);
+ else if (mark_type == FAN_MARK_FILESYSTEM)
+ ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
+ flags, fsid);
else
- ret = fanotify_add_inode_mark(group, inode, mask, flags);
+ ret = fanotify_add_inode_mark(group, inode, mask,
+ flags, fsid);
break;
case FAN_MARK_REMOVE:
- if (flags & FAN_MARK_MOUNT)
- ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+ if (mark_type == FAN_MARK_MOUNT)
+ ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
+ flags);
+ else if (mark_type == FAN_MARK_FILESYSTEM)
+ ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
+ flags);
else
- ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+ ret = fanotify_remove_inode_mark(group, inode, mask,
+ flags);
break;
default:
ret = -EINVAL;
}
+path_put_and_out:
path_put(&path);
fput_and_out:
fdput(f);
@@ -934,12 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
+
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
- fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+ fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
- KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC);
+ KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
return 0;
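The HWEIGHT32() lines above are compile-time popcount checks: BUILD_BUG_ON() fails the build whenever a flag bit is added or removed without revisiting this function. The idiom in isolation (flag values hypothetical):

	#define MY_FLAG_A	0x01
	#define MY_FLAG_B	0x02
	#define MY_FLAG_C	0x10
	#define MY_ALL_FLAGS	(MY_FLAG_A | MY_FLAG_B | MY_FLAG_C)

	static void my_flags_check(void)
	{
		/* HWEIGHT32() folds to a constant for constant input, so
		 * this costs nothing at runtime and trips at compile time
		 * if the flag count drifts from 3. */
		BUILD_BUG_ON(HWEIGHT32(MY_ALL_FLAGS) != 3);
	}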
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 86fcf5814279..1e2bfd26b352 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -15,6 +15,7 @@
#include <linux/exportfs.h>
#include "inotify/inotify.h"
+#include "fdinfo.h"
#include "fsnotify.h"
#if defined(CONFIG_PROC_FS)
@@ -131,37 +132,20 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+ } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
+ struct super_block *sb = fsnotify_conn_sb(mark->connector);
+
+ seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
+ sb->s_dev, mflags, mark->mask, mark->ignored_mask);
}
}
void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
{
struct fsnotify_group *group = f->private_data;
- unsigned int flags = 0;
-
- switch (group->priority) {
- case FS_PRIO_0:
- flags |= FAN_CLASS_NOTIF;
- break;
- case FS_PRIO_1:
- flags |= FAN_CLASS_CONTENT;
- break;
- case FS_PRIO_2:
- flags |= FAN_CLASS_PRE_CONTENT;
- break;
- }
-
- if (group->max_events == UINT_MAX)
- flags |= FAN_UNLIMITED_QUEUE;
-
- if (group->fanotify_data.max_marks == UINT_MAX)
- flags |= FAN_UNLIMITED_MARKS;
-
- if (group->fanotify_data.audit)
- flags |= FAN_ENABLE_AUDIT;
seq_printf(m, "fanotify flags:%x event-flags:%x\n",
- flags, group->fanotify_data.f_flags);
+ group->fanotify_data.flags, group->fanotify_data.f_flags);
show_fdinfo(m, f, fanotify_fdinfo);
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ababdbfab537..df06f3da166c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -48,7 +48,7 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
* Called during unmount with no locks held, so needs to be safe against
* concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
*/
-void fsnotify_unmount_inodes(struct super_block *sb)
+static void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *iput_inode = NULL;
@@ -96,6 +96,15 @@ void fsnotify_unmount_inodes(struct super_block *sb)
if (iput_inode)
iput(iput_inode);
+ /* Wait for outstanding inode references from connectors */
+ wait_var_event(&sb->s_fsnotify_inode_refs,
+ !atomic_long_read(&sb->s_fsnotify_inode_refs));
+}
+
+void fsnotify_sb_delete(struct super_block *sb)
+{
+ fsnotify_unmount_inodes(sb);
+ fsnotify_clear_marks_by_sb(sb);
}
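fsnotify_sb_delete() pairs wait_var_event() on s_fsnotify_inode_refs with the wake_up_var() issued when the last connector-held inode reference is dropped (see fsnotify_drop_object() in fs/notify/mark.c below). The bare pattern, sketched outside fsnotify:

	#include <linux/atomic.h>
	#include <linux/wait_bit.h>

	static atomic_long_t pending_refs;

	/* releasing side: the last reference wakes any sleeper */
	static void my_put_ref(void)
	{
		if (atomic_long_dec_and_test(&pending_refs))
			wake_up_var(&pending_refs);
	}

	/* tear-down side: sleep until every reference is gone */
	static void my_wait_refs(void)
	{
		wait_var_event(&pending_refs,
			       !atomic_long_read(&pending_refs));
	}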
/*
@@ -158,9 +167,9 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask
parent = dget_parent(dentry);
p_inode = parent->d_inode;
- if (unlikely(!fsnotify_inode_watches_children(p_inode)))
+ if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
__fsnotify_update_child_dentry_flags(p_inode);
- else if (p_inode->i_fsnotify_mask & mask) {
+ } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
struct name_snapshot name;
/* we are notifying a parent so come up with the new mask which
@@ -190,7 +199,7 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *group = NULL;
- __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
__u32 marks_mask = 0;
__u32 marks_ignored_mask = 0;
struct fsnotify_mark *mark;
@@ -319,15 +328,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
struct fsnotify_iter_info iter_info = {};
- struct mount *mnt;
+ struct super_block *sb = to_tell->i_sb;
+ struct mount *mnt = NULL;
+ __u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
int ret = 0;
- /* global tests shouldn't care about events on child only the specific event */
- __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
- if (data_is == FSNOTIFY_EVENT_PATH)
+ if (data_is == FSNOTIFY_EVENT_PATH) {
mnt = real_mount(((const struct path *)data)->mnt);
- else
- mnt = NULL;
+ mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
+ }
+ /* An event "on child" is not intended for a mount/sb mark */
+ if (mask & FS_EVENT_ON_CHILD)
+ mnt_or_sb_mask = 0;
/*
* Optimization: srcu_read_lock() has a memory barrier which can
@@ -336,31 +349,32 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!to_tell->i_fsnotify_marks &&
+ if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
(!mnt || !mnt->mnt_fsnotify_marks))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
- * otherwise return if neither the inode nor the vfsmount care about
+ * otherwise return if neither the inode nor the vfsmount/sb care about
* this type of event.
*/
if (!(mask & FS_MODIFY) &&
- !(test_mask & to_tell->i_fsnotify_mask) &&
- !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+ !(test_mask & (to_tell->i_fsnotify_mask | mnt_or_sb_mask)))
return 0;
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+ fsnotify_first_mark(&sb->s_fsnotify_marks);
if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
}
/*
- * We need to merge inode & vfsmount mark lists so that inode mark
- * ignore masks are properly reflected for mount mark notifications.
+ * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
+ * ignore masks are properly reflected for mount/sb mark notifications.
* That's why this traversal is so complicated...
*/
while (fsnotify_iter_select_report_types(&iter_info)) {
@@ -386,7 +400,7 @@ static __init int fsnotify_init(void)
{
int ret;
- BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 7902653dd577..5a00121fb219 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -21,6 +21,12 @@ static inline struct mount *fsnotify_conn_mount(
return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
}
+static inline struct super_block *fsnotify_conn_sb(
+ struct fsnotify_mark_connector *conn)
+{
+ return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+}
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -43,6 +49,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
+/* run the list of all marks associated with sb and destroy them */
+static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
+{
+ fsnotify_destroy_marks(&sb->s_fsnotify_marks);
+}
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
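fsnotify_conn_sb() above is the standard container_of() recovery of an enclosing structure from a pointer to one of its members; a minimal illustration (types hypothetical):

	#include <linux/kernel.h>
	#include <linux/list.h>

	struct my_outer {
		int cookie;
		struct list_head member;
	};

	static struct my_outer *my_outer_of(struct list_head *p)
	{
		/* walk back from &outer->member to the enclosing object */
		return container_of(p, struct my_outer, member);
	}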
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7e4578d35b61..74ae60305189 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -5,6 +5,7 @@
struct inotify_event_info {
struct fsnotify_event fse;
+ u32 mask;
int wd;
u32 sync_cookie;
int name_len;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f4184b4f3815..ff30abd6a49b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn,
{
struct inotify_event_info *old, *new;
- if (old_fsn->mask & FS_IN_IGNORED)
- return false;
old = INOTIFY_E(old_fsn);
new = INOTIFY_E(new_fsn);
- if ((old_fsn->mask == new_fsn->mask) &&
+ if (old->mask & FS_IN_IGNORED)
+ return false;
+ if ((old->mask == new->mask) &&
(old_fsn->inode == new_fsn->inode) &&
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
@@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group,
return -ENOMEM;
}
+ /*
+ * We now report the FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
+ * for fanotify. inotify never reported IN_ISDIR with those events.
+ * It looks like an oversight, but to avoid the risk of breaking
+ * existing inotify programs, mask the flag out from those events.
+ */
+ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
+ mask &= ~IN_ISDIR;
+
fsn_event = &event->fse;
- fsnotify_init_event(fsn_event, inode, mask);
+ fsnotify_init_event(fsn_event, inode);
+ event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
event->name_len = len;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ac6978d3208c..e2901fbb9f76 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
*/
pad_name_len = round_event_name_len(fsn_event);
inotify_event.len = pad_name_len;
- inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+ inotify_event.mask = inotify_mask_to_arg(event->mask);
inotify_event.wd = event->wd;
inotify_event.cookie = event->sync_cookie;
@@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
return ERR_PTR(-ENOMEM);
}
group->overflow_event = &oevent->fse;
- fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+ fsnotify_init_event(group->overflow_event, NULL);
+ oevent->mask = FS_Q_OVERFLOW;
oevent->wd = -1;
oevent->sync_cookie = 0;
oevent->name_len = 0;
@@ -724,8 +725,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
return -EBADF;
/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
- if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
- return -EINVAL;
+ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
+ ret = -EINVAL;
+ goto fput_and_out;
+ }
/* verify that this is indeed an inotify instance */
if (unlikely(f.file->f_op != &inotify_fops)) {
@@ -815,7 +818,7 @@ static int __init inotify_user_setup(void)
BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
- BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22);
+ BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
SLAB_PANIC|SLAB_ACCOUNT);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 59cdb27826de..d593d4269561 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -82,6 +82,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
+#include <linux/ratelimit.h>
#include <linux/atomic.h>
@@ -115,6 +116,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
+ else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
+ return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
return NULL;
}
@@ -179,19 +182,24 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static struct inode *fsnotify_detach_connector_from_object(
- struct fsnotify_mark_connector *conn)
+static void *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn,
+ unsigned int *type)
{
struct inode *inode = NULL;
+ *type = conn->type;
if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
return NULL;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+ atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
+ fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
rcu_assign_pointer(*(conn->obj), NULL);
@@ -211,10 +219,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
fsnotify_put_group(group);
}
+/* Drop object reference originally held by a connector */
+static void fsnotify_drop_object(unsigned int type, void *objp)
+{
+ struct inode *inode;
+ struct super_block *sb;
+
+ if (!objp)
+ return;
+ /* Currently only inode references are passed to be dropped */
+ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
+ return;
+ inode = objp;
+ sb = inode->i_sb;
+ iput(inode);
+ if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
+ wake_up_var(&sb->s_fsnotify_inode_refs);
+}
+
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
struct fsnotify_mark_connector *conn;
- struct inode *inode = NULL;
+ void *objp = NULL;
+ unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
@@ -234,7 +261,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn = mark->connector;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
__fsnotify_recalc_mask(conn);
@@ -242,7 +269,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
mark->connector = NULL;
spin_unlock(&conn->lock);
- iput(inode);
+ fsnotify_drop_object(type, objp);
if (free_conn) {
spin_lock(&destroy_lock);
@@ -455,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
@@ -467,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
INIT_HLIST_HEAD(&conn->list);
conn->type = type;
conn->obj = connp;
+ /* Cache fsid of filesystem containing the object */
+ if (fsid)
+ conn->fsid = *fsid;
+ else
+ conn->fsid.val[0] = conn->fsid.val[1] = 0;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
inode = igrab(fsnotify_conn_inode(conn));
/*
@@ -518,7 +551,7 @@ out:
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -527,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
if (WARN_ON(!fsnotify_valid_obj_type(type)))
return -EINVAL;
+
+ /* Backend is expected to check for zero fsid (e.g. tmpfs) */
+ if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
+ return -ENODEV;
+
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, type);
+ err = fsnotify_attach_connector_to_object(connp, type, fsid);
if (err)
return err;
goto restart;
+ } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) &&
+ (fsid->val[0] != conn->fsid.val[0] ||
+ fsid->val[1] != conn->fsid.val[1])) {
+ /*
+ * Backend is expected to check for a non-uniform fsid
+ * (e.g. btrfs), but maybe we missed something?
+ * Only allow setting conn->fsid once, to a non-zero fsid.
+ * inotify and non-fid fanotify groups neither set nor test
+ * conn->fsid.
+ */
+ pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
+ "%x.%x != %x.%x\n", __func__, conn->type,
+ fsid->val[0], fsid->val[1],
+ conn->fsid.val[0], conn->fsid.val[1]);
+ err = -EXDEV;
+ goto out_err;
}
/* is mark the first mark? */
@@ -580,7 +634,7 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
@@ -601,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
if (ret)
goto err;
@@ -622,13 +676,13 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int type, int allow_dups)
+ unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
mutex_unlock(&group->mark_mutex);
return ret;
}
@@ -709,7 +763,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark, *old_mark = NULL;
- struct inode *inode;
+ void *objp;
+ unsigned int type;
conn = fsnotify_grab_connector(connp);
if (!conn)
@@ -735,11 +790,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
* mark references get dropped. It would lead to strange results such
* as delaying inode deletion or blocking unmount.
*/
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
spin_unlock(&conn->lock);
if (old_mark)
fsnotify_put_mark(old_mark);
- iput(inode);
+ fsnotify_drop_object(type, objp);
}
/*
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3c3e36745f59..5f3a54d444b5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
struct fsnotify_event *event)
{
/* Overflow events are per-group and we don't want to free them */
- if (!event || event->mask == FS_Q_OVERFLOW)
+ if (!event || event == group->overflow_event)
return;
/*
* If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +141,18 @@ queue:
return ret;
}
+void fsnotify_remove_queued_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
+{
+ assert_spin_locked(&group->notification_lock);
+ /*
+ * We need to init list head for the case of overflow event so that
+ * check in fsnotify_add_event() works
+ */
+ list_del_init(&event->list);
+ group->q_len--;
+}
+
/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
@@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
event = list_first_entry(&group->notification_list,
struct fsnotify_event, list);
- /*
- * We need to init list head for the case of overflow event so that
- * check in fsnotify_add_event() works
- */
- list_del_init(&event->list);
- group->q_len--;
-
+ fsnotify_remove_queued_event(group, event);
return event;
}
@@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
}
spin_unlock(&group->notification_lock);
}
-
-/*
- * fsnotify_create_event - Allocate a new event which will be sent to each
- * group's handle_event function if the group was interested in this
- * particular event.
- *
- * @inode the inode which is supposed to receive the event (sometimes a
- * parent of the inode to which the event happened.
- * @mask what actually happened.
- * @data pointer to the object which was actually affected
- * @data_type flag indication if the data is a file, path, inode, nothing...
- * @name the filename, if available
- */
-void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
- u32 mask)
-{
- INIT_LIST_HEAD(&event->list);
- event->inode = inode;
- event->mask = mask;
-}
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index ab172e5f51d9..5becc8acc8f4 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
/* return (void *)__get_free_page(gfp_mask); */
}
- if (likely((size >> PAGE_SHIFT) < totalram_pages))
+ if (likely((size >> PAGE_SHIFT) < totalram_pages()))
return __vmalloc(size, gfp_mask, PAGE_KERNEL);
return NULL;
}
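totalram_pages became an accessor in this cycle because the underlying counter turned into an atomic_long_t; the shape is roughly the following (sketched from include/linux/mm.h of that era, treat as illustrative):

	extern atomic_long_t _totalram_pages;

	static inline unsigned long totalram_pages(void)
	{
		/* read the atomic page counter as a plain value */
		return (unsigned long)atomic_long_read(&_totalram_pages);
	}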
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 4690cd75d8d7..3986c7a1f6a8 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -312,7 +312,7 @@ static struct dentry *ntfs_get_parent(struct dentry *child_dent)
/* Get the mft record of the inode belonging to the child dentry. */
mrec = map_mft_record(ni);
if (IS_ERR(mrec))
- return (struct dentry *)mrec;
+ return ERR_CAST(mrec);
/* Find the first file name attribute in the mft record. */
ctx = ntfs_attr_get_search_ctx(ni, mrec);
if (unlikely(!ctx)) {
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 99ee093182cb..cc9b32b9db7c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a342f008e42f..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle,
* rightmost extent list.
*/
if (path->p_tree_depth) {
- struct ocfs2_extent_block *eb;
-
ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
@@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle,
mlog_errno(ret);
goto out;
}
-
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
}
if (rec->e_cpos == split_rec->e_cpos &&
@@ -7536,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
@@ -7547,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
- struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
@@ -7556,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -7574,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- len = range->len >> osb->s_clustersize_bits;
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
-
- trace_ocfs2_trim_fs(start, len, minlen);
-
- ocfs2_trim_fs_lock_res_init(osb);
- ret = ocfs2_trim_fs_lock(osb, NULL, 1);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
+ /*
+ * Do some checks before trimming the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
goto out_unlock;
}
- mlog(ML_NOTICE, "Wait for trim on device (%s) to "
- "finish, which is running from another node.\n",
- osb->dev_str);
- ret = ocfs2_trim_fs_lock(osb, &info, 0);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
- goto out_unlock;
- }
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- if (info.tf_valid && info.tf_success &&
- info.tf_start == start && info.tf_len == len &&
- info.tf_minlen == minlen) {
- /* Avoid sending duplicated trim to a shared device */
- mlog(ML_NOTICE, "The same trim on device (%s) was "
- "just done from node (%u), return.\n",
- osb->dev_str, info.tf_nodenum);
- range->len = info.tf_trimlen;
- goto out_trimunlock;
- }
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
}
- info.tf_nodenum = osb->node_num;
- info.tf_start = start;
- info.tf_len = len;
- info.tf_minlen = minlen;
-
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
-
- trimmed = 0;
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
@@ -7663,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
- info.tf_trimlen = range->len;
- info.tf_success = (ret ? 0 : 1);
- pinfo = &info;
-out_trimunlock:
- ocfs2_trim_fs_unlock(osb, pinfo);
- ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+ * If not all groups have been trimmed yet (and nothing failed), release
+ * the main_bm related locks first to avoid starving other I/O, then go
+ * on to trim the next group.
+ */
+ if (ret >= 0 && group <= last_group)
+ goto next_group;
+out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
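For reference, the path into ocfs2_trim_fs() starts at the FITRIM ioctl; a minimal sketch of the fstrim(8) side (mount point arbitrary, error handling trimmed):

	#include <fcntl.h>
	#include <limits.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	static int trim_all(const char *mntpoint)
	{
		struct fstrim_range range = {
			.start = 0,
			.len = ULLONG_MAX,	/* whole filesystem */
			.minlen = 0,
		};
		int fd = open(mntpoint, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		/* on success, range.len is rewritten to bytes trimmed */
		ret = ioctl(fd, FITRIM, &range);
		close(fd);
		return ret;
	}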
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 302cd7caa4a7..832c1759a09a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -30,6 +30,7 @@
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
+#include <linux/mm.h>
#include <cluster/masklog.h>
@@ -397,7 +398,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
* Check whether a remote node truncated this file - we just
* drop out in that case as it's not worth handling here.
*/
- last = list_entry(pages->prev, struct page, lru);
+ last = lru_to_page(pages);
start = (loff_t)last->index << PAGE_SHIFT;
if (start >= i_size_read(inode))
goto out_unlock;
@@ -1392,8 +1393,7 @@ retry:
unlock:
spin_unlock(&oi->ip_lock);
out:
- if (new)
- kfree(new);
+ kfree(new);
return ret;
}
@@ -2412,8 +2412,16 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- if (bytes > 0 && private)
- ret = ocfs2_dio_end_io_write(inode, private, offset, bytes);
+ if (bytes <= 0)
+ mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld",
+ (long long)bytes);
+ if (private) {
+ if (bytes > 0)
+ ret = ocfs2_dio_end_io_write(inode, private, offset,
+ bytes);
+ else
+ ocfs2_dio_free_write_ctx(inode, private);
+ }
ocfs2_iocb_clear_rw_locked(iocb);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1d098c3c00e0..f9b84f7a3e4b 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -99,25 +99,34 @@ out:
return ret;
}
+/* Caller must provide a bhs[] whose entries are either all NULL or all
+ * non-NULL, so that read failures are easier to handle.
+ */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
+ int new_bh = 0;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
+ /* Don't put a buffer head and reset it to NULL if it was allocated
+ * by the caller, since the caller can't be aware of this alteration!
+ */
+ new_bh = (bhs[0] == NULL);
+
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ break;
}
}
bh = bhs[i];
@@ -152,15 +161,31 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
#endif
}
- clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, 0, bh);
}
+read_failure:
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
+ if (unlikely(status)) {
+ if (new_bh && bh) {
+ /* If a middle bh fails, let the previous bh
+ * finish its read and then put it to
+ * avoid a bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
+
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -170,8 +195,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
- put_bh(bh);
- bhs[i - 1] = NULL;
+ goto read_failure;
}
}
@@ -179,6 +203,9 @@ bail:
return status;
}
+/* Caller must provide a bhs[] whose entries are either all NULL or all
+ * non-NULL, so that read failures are easier to handle.
+ */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -188,6 +215,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int new_bh = 0;
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
@@ -213,6 +241,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
goto bail;
}
+ /* Don't put a buffer head and reset it to NULL if it was allocated
+ * by the caller, since the caller can't be aware of this alteration!
+ */
+ new_bh = (bhs[0] == NULL);
+
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
@@ -221,7 +254,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ /* Don't forget to put previous bh! */
+ break;
}
}
bh = bhs[i];
@@ -306,7 +340,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
continue;
}
- clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
if (validate)
set_buffer_needs_validate(bh);
@@ -316,16 +349,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
}
- status = 0;
-
+read_failure:
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
- if (status) {
- /* Clear the rest of the buffers on error */
- put_bh(bh);
- bhs[i] = NULL;
+ if (unlikely(status)) {
+ /* Clear the buffers on error, including those
+ * that had already succeeded in reading
+ */
+ if (new_bh && bh) {
+ /* If a middle bh fails, let the previous bh
+ * finish its read and then put it to
+ * avoid a bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
continue;
}
/* We know this can't have changed as we hold the
@@ -343,9 +387,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* uptodate. */
status = -EIO;
clear_buffer_needs_validate(bh);
- put_bh(bh);
- bhs[i] = NULL;
- continue;
+ goto read_failure;
}
if (buffer_needs_validate(bh)) {
@@ -355,11 +397,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
- if (status) {
- put_bh(bh);
- bhs[i] = NULL;
- continue;
- }
+ if (status)
+ goto read_failure;
}
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9b2ed62dd638..f3c20b279eb2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -582,9 +582,10 @@ bail:
}
static int o2hb_read_slots(struct o2hb_region *reg,
+ unsigned int begin_slot,
unsigned int max_slots)
{
- unsigned int current_slot=0;
+ unsigned int current_slot = begin_slot;
int status;
struct o2hb_bio_wait_ctxt wc;
struct bio *bio;
@@ -1093,9 +1094,14 @@ static int o2hb_highest_node(unsigned long *nodes, int numbits)
return find_last_bit(nodes, numbits);
}
+static int o2hb_lowest_node(unsigned long *nodes, int numbits)
+{
+ return find_first_bit(nodes, numbits);
+}
+
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node;
+ int i, ret, highest_node, lowest_node;
int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -1120,7 +1126,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
}
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
- if (highest_node >= O2NM_MAX_NODES) {
+ lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES);
+ if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) {
mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
ret = -EINVAL;
goto bail;
@@ -1130,7 +1137,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* yet. Of course, if the node definitions have holes in them
* then we're reading an empty slot anyway... Consider this
* best-effort. */
- ret = o2hb_read_slots(reg, highest_node + 1);
+ ret = o2hb_read_slots(reg, lowest_node, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
goto bail;
@@ -1801,7 +1808,7 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
- ret = o2hb_read_slots(reg, reg->hr_blocks);
+ ret = o2hb_read_slots(reg, 0, reg->hr_blocks);
if (ret)
goto out;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 308ea0eb35fd..a396096a5099 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -178,6 +178,15 @@ do { \
##__VA_ARGS__); \
} while (0)
+#define mlog_ratelimited(mask, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ mlog(mask, fmt, ##__VA_ARGS__); \
+} while (0)
+
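A usage sketch of the new macro, mirroring the ocfs2_dio_end_io() call site earlier in this patch:

	mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld",
			 (long long)bytes);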
#define mlog_errno(st) ({ \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 7d9eea7d4a87..e9f236af1927 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -916,7 +916,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
struct kvec vec = { .iov_len = len, .iov_base = data, };
struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len);
+ iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len);
return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b048d4fa3959..c121abbdfc7d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1897,8 +1897,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
/* On error, skip the f_pos to the
next block. */
ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
- brelse(bh);
- continue;
+ break;
}
if (le64_to_cpu(de->inode)) {
unsigned char d_type = DT_UNKNOWN;
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index bd1aab1f49a4..ef2854422a6e 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)/..
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 9b984cae4c4e..1d6dc8422899 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
char *buf;
- buf = (char *) get_zeroed_page(GFP_NOFS);
+ buf = (char *) get_zeroed_page(GFP_ATOMIC);
if (buf) {
dump_mle(mle, buf, PAGE_SIZE - 1);
free_page((unsigned long)buf);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 838a06d4066a..074d5de17bb2 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
/* don't shuffle secondary queues */
- if ((res->owner == dlm->node_num)) {
+ if (res->owner == dlm->node_num) {
if (res->state & (DLM_LOCK_RES_MIGRATING |
DLM_LOCK_RES_BLOCK_DIRTY))
return;
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index eed3db8c5b49..33431a0296a3 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)/..
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 602c71f32740..8decbe95dcec 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -179,7 +179,7 @@ bail:
static int dlmfs_file_release(struct inode *inode,
struct file *file)
{
- int level, status;
+ int level;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
struct dlmfs_filp_private *fp = file->private_data;
@@ -188,7 +188,6 @@ static int dlmfs_file_release(struct inode *inode,
mlog(0, "close called on inode %lu\n", inode->i_ino);
- status = 0;
if (fp) {
level = fp->fp_lock_level;
if (level != DLM_LOCK_IV)
@@ -255,7 +254,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
if (!count)
return 0;
- if (!access_ok(VERIFY_WRITE, buf, count))
+ if (!access_ok(buf, count))
return -EFAULT;
/* don't read past the lvb */
@@ -303,7 +302,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
if (!count)
return 0;
- if (!access_ok(VERIFY_READ, buf, count))
+ if (!access_ok(buf, count))
return -EFAULT;
/* don't write past the lvb */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 933aac5da193..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+ /* Only one trimfs thread is allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
}
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
@@ -2123,10 +2128,10 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
/* LVB only has room for 64 bits of time here so we pack it for
* now. */
-static u64 ocfs2_pack_timespec(struct timespec *spec)
+static u64 ocfs2_pack_timespec(struct timespec64 *spec)
{
u64 res;
- u64 sec = spec->tv_sec;
+ u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
u32 nsec = spec->tv_nsec;
res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
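A sketch of the packing scheme, assuming OCFS2_SEC_SHIFT is 30 and OCFS2_NSEC_MASK is (1ULL << 30) - 1 as defined earlier in dlmglue.c; the clamp keeps tv_sec within the 34 bits available above the nanosecond field, so out-of-range times saturate instead of corrupting adjacent bits in the LVB:

/* Pack: | 34-bit seconds | 30-bit nanoseconds | */
u64 packed = ((u64)clamp_t(time64_t, ts.tv_sec, 0, 0x3ffffffffull)
	      << OCFS2_SEC_SHIFT) | (ts.tv_nsec & OCFS2_NSEC_MASK);

/* Unpacking recovers the clamped value: */
struct timespec64 out = {
	.tv_sec  = packed >> OCFS2_SEC_SHIFT,
	.tv_nsec = packed & OCFS2_NSEC_MASK,
};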
@@ -2142,7 +2147,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2163,15 +2167,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- ts = timespec64_to_timespec(inode->i_atime);
lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
- ts = timespec64_to_timespec(inode->i_ctime);
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
- ts = timespec64_to_timespec(inode->i_mtime);
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2180,7 +2181,7 @@ out:
mlog_meta_lvb(0, lockres);
}
-static void ocfs2_unpack_timespec(struct timespec *spec,
+static void ocfs2_unpack_timespec(struct timespec64 *spec,
u64 packed_time)
{
spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
@@ -2189,7 +2190,6 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
- struct timespec ts;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
@@ -2217,15 +2217,12 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
- inode->i_atime = timespec_to_timespec64(ts);
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_mtime,
be64_to_cpu(lvb->lvb_imtime_packed));
- inode->i_mtime = timespec_to_timespec64(ts);
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
- inode->i_ctime = timespec_to_timespec64(ts);
spin_unlock(&oi->ip_lock);
}
@@ -3603,7 +3600,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* we can recover correctly from node failure. Otherwise, we may get
* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
- if (!ocfs2_is_o2cb_active() &&
+ if (ocfs2_userspace_stack(osb) &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
lvb = 1;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 9f88188060db..4bf8d5854b27 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -125,10 +125,10 @@ check_err:
check_gen:
if (handle->ih_generation != inode->i_generation) {
- iput(inode);
trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
handle->ih_generation,
inode->i_generation);
+ iput(inode);
result = ERR_PTR(-ESTALE);
goto bail;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9fa35cb6f6e0..d640c5f8a85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2343,7 +2343,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
written = __generic_file_write_iter(iocb, from);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(written == -EIOCBQUEUED && !direct_io);
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -2463,7 +2463,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !direct_io);
/* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
@@ -2527,24 +2527,79 @@ out:
return offset;
}
-static int ocfs2_file_clone_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
+static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, false);
-}
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ bool same_inode = (inode_in == inode_out);
+ loff_t remapped = 0;
+ ssize_t ret;
-static int ocfs2_file_dedupe_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
-{
- return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, true);
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ /* Check file eligibility and prepare for block sharing. */
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ /* Lock out changes to the allocation maps and remap. */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data,
+ round_down(pos_out, PAGE_SIZE),
+ round_up(pos_out + len, PAGE_SIZE) - 1);
+
+ remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
+ inode_out, out_bh, pos_out, len);
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (remapped < 0) {
+ ret = remapped;
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return remapped > 0 ? remapped : ret;
}
const struct inode_operations ocfs2_file_iops = {
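With this conversion the handler follows the new ->remap_file_range() contract: it returns the number of bytes remapped (possibly short) or a negative errno, and REMAP_FILE_DEDUP selects dedupe semantics. An illustrative caller-side sketch (variable names hypothetical):

loff_t done = dst_file->f_op->remap_file_range(src_file, 0, dst_file, 0,
					       len, 0);
if (done < 0)
	pr_debug("remap failed: %lld\n", (long long)done);
else if (done < len)
	pr_debug("short remap: %lld of %lld bytes\n",
		 (long long)done, (long long)len);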
@@ -2586,8 +2641,7 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
- .clone_file_range = ocfs2_file_clone_range,
- .dedupe_file_range = ocfs2_file_dedupe_range,
+ .remap_file_range = ocfs2_remap_file_range,
};
const struct file_operations ocfs2_dops = {
@@ -2633,8 +2687,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
- .clone_file_range = ocfs2_file_clone_range,
- .dedupe_file_range = ocfs2_file_dedupe_range,
+ .remap_file_range = ocfs2_remap_file_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bd3475694e83..46fd3ef2cf21 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1017,7 +1017,8 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
mlog_errno(status);
}
- if (status == 0) {
+ /* Shutdown the kernel journal system */
+ if (!jbd2_journal_destroy(journal->j_journal) && !status) {
/*
* Do not toggle if flush was unsuccessful otherwise
* will leave dirty metadata in a "clean" journal
@@ -1026,9 +1027,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (status < 0)
mlog_errno(status);
}
-
- /* Shutdown the kernel journal system */
- jbd2_journal_destroy(journal->j_journal);
journal->j_journal = NULL;
OCFS2_I(inode)->ip_open_count--;
@@ -1378,15 +1376,23 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
+ /* Whether quota is supported. */
+ int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA);
+
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
}
- rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
- if (!rm_quota) {
- status = -ENOMEM;
- goto bail;
+ if (quota_enabled) {
+ rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
}
restart:
status = ocfs2_super_lock(osb, 1);
@@ -1422,9 +1428,14 @@ restart:
* then quota usage would be out of sync until some node takes
* the slot. So we remember which nodes need quota recovery
* and when everything else is done, we recover quotas. */
- for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
- if (i == rm_quota_used)
- rm_quota[rm_quota_used++] = slot_num;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used
+ && rm_quota[i] != slot_num; i++)
+ ;
+
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+ }
status = ocfs2_recover_node(osb, node_num, slot_num);
skip_recovery:
@@ -1452,16 +1463,19 @@ skip_recovery:
/* Now it is the right time to recover quotas... We have to do this under
* superblock lock so that no one can start using the slot (and crash)
* before we recover it */
- for (i = 0; i < rm_quota_used; i++) {
- qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
- if (IS_ERR(qrec)) {
- status = PTR_ERR(qrec);
- mlog_errno(status);
- continue;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal,
+ rm_quota[i],
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
- ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec,
- ORPHAN_NEED_TRUNCATE);
}
ocfs2_super_unlock(osb, 1);
@@ -1483,7 +1497,8 @@ bail:
mutex_unlock(&osb->recovery_lock);
- kfree(rm_quota);
+ if (quota_enabled)
+ kfree(rm_quota);
/* no one is calling kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7642b6712c39..58973e4d2471 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
if (num_used
|| alloc->id1.bitmap1.i_used
|| alloc->id1.bitmap1.i_total
- || la->la_bm_off)
- mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+ || la->la_bm_off) {
+ mlog(ML_ERROR, "inconsistent detected, clean journal with"
+ " unrecovered local alloc, please run fsck.ocfs2!\n"
"found = %u, set = %u, taken = %u, off = %u\n",
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
le32_to_cpu(alloc->id1.bitmap1.i_total),
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+ status = -EINVAL;
+ goto bail;
+ }
+
osb->local_alloc_bh = alloc_bh;
osb->local_alloc_state = OCFS2_LA_ENABLED;
@@ -835,7 +840,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
u32 *numbits,
struct ocfs2_alloc_reservation *resv)
{
- int numfound = 0, bitoff, left, startoff, lastzero;
+ int numfound = 0, bitoff, left, startoff;
int local_resv = 0;
struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
@@ -873,7 +878,6 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
- lastzero = -1;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
if (bitoff == left) {
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index d56f0079b858..b11acd34001a 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -52,6 +52,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
lockres->l_level > LKM_NLMODE) {
int old_level = 0;
+ struct file_lock request;
if (lockres->l_level == LKM_EXMODE)
old_level = 1;
@@ -66,11 +67,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
* level.
*/
- locks_lock_file_wait(file,
- &(struct file_lock) {
- .fl_type = F_UNLCK,
- .fl_flags = FL_FLOCK
- });
+ locks_init_lock(&request);
+ request.fl_type = F_UNLCK;
+ request.fl_flags = FL_FLOCK;
+ locks_lock_file_wait(file, &request);
ocfs2_file_unlock(file);
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 7eb3b0a6347e..1565dd8e8856 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -25,6 +25,7 @@
#include "ocfs2_ioctl.h"
#include "alloc.h"
+#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -156,18 +157,14 @@ out:
}
/*
- * lock allocators, and reserving appropriate number of bits for
- * meta blocks and data clusters.
- *
- * in some cases, we don't need to reserve clusters, just let data_ac
- * be NULL.
+ * lock the meta allocator, and reserve an appropriate number of
+ * bits for meta blocks.
*/
-static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters_to_move,
u32 extents_to_split,
struct ocfs2_alloc_context **meta_ac,
- struct ocfs2_alloc_context **data_ac,
int extra_blocks,
int *credits)
{
@@ -192,13 +189,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
goto out;
}
- if (data_ac) {
- ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
@@ -233,6 +223,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
struct ocfs2_refcount_tree *ref_tree = NULL;
u32 new_phys_cpos, new_len;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ int need_free = 0;
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
BUG_ON(!ocfs2_is_refcount_inode(inode));
@@ -257,10 +248,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
- &context->meta_ac,
- &context->data_ac,
- extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ *len, 1,
+ &context->meta_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -283,6 +274,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
+ /*
+ * Make sure ocfs2_reserve_clusters() is called after
+ * __ocfs2_flush_truncate_log(); calling it first may deadlock
+ * on the global bitmap.
+ */
+ ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -308,6 +314,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
if (!partial) {
context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
ret = -ENOSPC;
+ need_free = 1;
goto out_commit;
}
}
@@ -332,6 +339,20 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
mlog_errno(ret);
out_commit:
+ if (need_free && context->data_ac) {
+ struct ocfs2_alloc_context *data_ac = context->data_ac;
+
+ if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ new_phys_cpos, new_len);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
+ new_len);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
@@ -600,9 +621,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
- &context->meta_ac,
- NULL, extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ len, 1,
+ &context->meta_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee145c733..a35259eebc56 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
- struct ocfs2_refcount_block *rb;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
struct ocfs2_refcount_tree *ref_tree;
@@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
mlog_errno(ret);
goto out;
}
- rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
&ref_tree->rf_ci, ref_root_bh,
@@ -4468,9 +4466,9 @@ out:
}
/* Update destination inode size, if necessary. */
-static int ocfs2_reflink_update_dest(struct inode *dest,
- struct buffer_head *d_bh,
- loff_t newlen)
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
{
handle_t *handle;
int ret;
@@ -4507,14 +4505,14 @@ out_commit:
}
/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
-static int ocfs2_reflink_remap_extent(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et;
@@ -4522,8 +4520,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb;
+ loff_t remapped_bytes = 0;
loff_t pstart, plen;
- u32 p_cluster, num_clusters, slast, spos, tpos;
+ u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
unsigned int ext_flags;
int ret = 0;
@@ -4605,30 +4604,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
next_loop:
spos += num_clusters;
tpos += num_clusters;
+ remapped_clus += num_clusters;
}
-out:
- return ret;
+ goto out;
out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
- return ret;
+out:
+ remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+ remapped_bytes = min_t(loff_t, len, remapped_bytes);
+
+ return remapped_bytes > 0 ? remapped_bytes : ret;
}
/* Set up refcount tree and remap s_inode to t_inode. */
-static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len)
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
{
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb;
struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit;
- int ret;
+ loff_t ret;
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
@@ -4700,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
/* Actually remap extents now. */
ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
pos_out, len, &dealloc);
- if (ret) {
+ if (ret < 0) {
mlog_errno(ret);
goto out;
}
@@ -4715,10 +4718,10 @@ out:
}
/* Lock an inode and grab a bh pointing to the inode. */
-static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
- struct buffer_head **bh1,
- struct inode *t_inode,
- struct buffer_head **bh2)
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2)
{
struct inode *inode1;
struct inode *inode2;
@@ -4803,10 +4806,10 @@ out_i1:
}
/* Unlock both inodes and release buffers. */
-static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
- struct buffer_head *s_bh,
- struct inode *t_inode,
- struct buffer_head *t_bh)
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
{
ocfs2_inode_unlock(s_inode, 1);
ocfs2_rw_unlock(s_inode, 1);
@@ -4818,82 +4821,3 @@ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
}
unlock_two_nondirectories(s_inode, t_inode);
}
-
-/* Link a range of blocks from one file to another. */
-int ocfs2_reflink_remap_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
- struct buffer_head *in_bh = NULL, *out_bh = NULL;
- bool same_inode = (inode_in == inode_out);
- ssize_t ret;
-
- if (!ocfs2_refcount_tree(osb))
- return -EOPNOTSUPP;
- if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
- return -EROFS;
-
- /* Lock both files against IO */
- ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
- if (ret)
- return ret;
-
- /* Check file eligibility and prepare for block sharing. */
- ret = -EINVAL;
- if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
- (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
- goto out_unlock;
-
- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- &len, is_dedupe);
- if (ret <= 0)
- goto out_unlock;
-
- /* Lock out changes to the allocation maps and remap. */
- down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
- if (!same_inode)
- down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
- SINGLE_DEPTH_NESTING);
-
- ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
- out_bh, pos_out, len);
-
- /* Zap any page cache for the destination file's range. */
- if (!ret)
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + len) - 1);
-
- up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
- if (!same_inode)
- up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
- if (ret) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
- /*
- * Empty the extent map so that we may get the right extent
- * record from the disk.
- */
- ocfs2_extent_map_trunc(inode_in, 0);
- ocfs2_extent_map_trunc(inode_out, 0);
-
- ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
- if (ret) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
- ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
- return 0;
-
-out_unlock:
- ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
- return ret;
-}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 4af55bf4b35b..e9e862be4a1e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -115,11 +115,23 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
-int ocfs2_reflink_remap_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe);
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len);
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2);
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh);
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen);
#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d7407994f308..ea0756d83250 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -55,7 +55,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[];
};
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
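struct_size() (from <linux/overflow.h>) computes sizeof(*si) plus the trailing array elements with overflow checking, which is what makes the manual pointer fixup removable above. A generic sketch of the flexible-array idiom, with hypothetical names:

struct example {
	unsigned int nr;
	struct ocfs2_slot slots[];	/* flexible array, must be last */
};

/* nr_slots is the caller-supplied element count. */
struct example *e = kzalloc(struct_size(e, slots, nr_slots), GFP_KERNEL);
/* On success, e->slots[0..nr_slots-1] is directly usable. */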
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d6c350ba25b9..c4b029c43464 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
-inline int ocfs2_is_o2cb_active(void)
-{
- return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
-}
-EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
-
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e3036e1790e8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
-/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
-int ocfs2_is_o2cb_active(void);
-
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb))
goto leave;
+ mutex_init(&osb->obs_trim_fs_mutex);
+
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 607092f367ad..1b2d0d2fe2ee 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -199,10 +199,11 @@ static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry
child = dp->child;
while (child) {
- int n = strlen(child->path_component_name);
+ const char *node_name = kbasename(child->full_name);
+ int n = strlen(node_name);
if (len == n &&
- !strncmp(child->path_component_name, name, len)) {
+ !strncmp(node_name, name, len)) {
ent_type = op_inode_node;
ent_data.node = child;
ino = child->unique_id;
@@ -245,7 +246,7 @@ found:
set_nlink(inode, 2);
break;
case op_inode_prop:
- if (!strcmp(dp->name, "options") && (len == 17) &&
+ if (of_node_name_eq(dp, "options") && (len == 17) &&
!strncmp (name, "security-password", 17))
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
else
@@ -293,8 +294,8 @@ static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
}
while (child) {
if (!dir_emit(ctx,
- child->path_component_name,
- strlen(child->path_component_name),
+ kbasename(child->full_name),
+ strlen(kbasename(child->full_name)),
child->unique_id, DT_DIR))
goto out;
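kbasename() returns the text after the last '/', so on a device-tree full_name it yields exactly what the removed path_component_name field used to hold. An illustrative example (hypothetical path):

/* kbasename("/pci@1f,0/ebus@1/su@14,3083f8") == "su@14,3083f8" */
const char *node_name = kbasename(child->full_name);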
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a5a2fe76568f..b094d3d79354 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -398,8 +398,6 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
loff_t pos = iocb->ki_pos;
ssize_t rc = 0;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
orangefs_stats.reads++;
@@ -416,8 +414,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
loff_t pos;
ssize_t rc;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
inode_lock(file->f_mapping->host);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 5e65d818937b..f038235c64bd 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -25,7 +25,7 @@ static int read_one_page(struct page *page)
struct iov_iter to;
struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
- iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
+ iov_iter_bvec(&to, READ, &bv, 1, PAGE_SIZE);
gossip_debug(GOSSIP_INODE_DEBUG,
"orangefs_readpage called with page %p\n",
@@ -77,7 +77,7 @@ static int orangefs_readpages(struct file *file,
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page;
- page = list_entry(pages->prev, struct page, lru);
+ page = lru_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache(page,
mapping,
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index c4e98c9c1621..443bcd8c3c19 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -105,7 +105,7 @@ static int wait_for_free(struct slot_map *m)
left = t;
else
left = t + (left - n);
- if (unlikely(signal_pending(current)))
+ if (signal_pending(current))
left = -EINTR;
} while (left > 0);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 1cc797a08a5b..9e62dcf06fc4 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
+ loff_t cloned;
int error = 0;
if (len == 0)
@@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
}
/* Try to use clone_file_range to clone up within the same fs */
- error = do_clone_file_range(old_file, 0, new_file, 0, len);
- if (!error)
+ cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ if (cloned == len)
goto out;
/* Couldn't clone, so now we try to copy the data */
- error = 0;
/* FIXME: copy up sparse files efficiently */
while (len) {
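do_clone_file_range() now returns a byte count (loff_t) rather than 0 or -errno, so copy-up treats anything short of len, including errors, as "fall back to a plain data copy". A condensed sketch of the pattern:

/* Reflink if the fs can clone the whole range, otherwise copy. */
loff_t cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);

if (cloned != len) {
	/* Partial clone or error: ignore it and copy the bytes. */
	error = ovl_copy_data_loop(old_file, new_file, len); /* hypothetical */
}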
@@ -395,7 +395,6 @@ struct ovl_copy_up_ctx {
struct dentry *destdir;
struct qstr destname;
struct dentry *workdir;
- bool tmpfile;
bool origin;
bool indexed;
bool metacopy;
@@ -440,63 +439,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
return err;
}
-static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
- struct dentry **newdentry)
-{
- int err;
- struct dentry *upper;
- struct inode *udir = d_inode(c->destdir);
-
- upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
- if (IS_ERR(upper))
- return PTR_ERR(upper);
-
- if (c->tmpfile)
- err = ovl_do_link(temp, udir, upper);
- else
- err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
-
- if (!err)
- *newdentry = dget(c->tmpfile ? upper : temp);
- dput(upper);
-
- return err;
-}
-
-static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
-{
- int err;
- struct dentry *temp;
- const struct cred *old_creds = NULL;
- struct cred *new_creds = NULL;
- struct ovl_cattr cattr = {
- /* Can't properly set mode on creation because of the umask */
- .mode = c->stat.mode & S_IFMT,
- .rdev = c->stat.rdev,
- .link = c->link
- };
-
- err = security_inode_copy_up(c->dentry, &new_creds);
- temp = ERR_PTR(err);
- if (err < 0)
- goto out;
-
- if (new_creds)
- old_creds = override_creds(new_creds);
-
- if (c->tmpfile)
- temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
- else
- temp = ovl_create_temp(c->workdir, &cattr);
-out:
- if (new_creds) {
- revert_creds(old_creds);
- put_cred(new_creds);
- }
-
- return temp;
-}
-
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
int err;
@@ -548,51 +490,148 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
-static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
+struct ovl_cu_creds {
+ const struct cred *old;
+ struct cred *new;
+};
+
+static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc)
+{
+ int err;
+
+ cc->old = cc->new = NULL;
+ err = security_inode_copy_up(dentry, &cc->new);
+ if (err < 0)
+ return err;
+
+ if (cc->new)
+ cc->old = override_creds(cc->new);
+
+ return 0;
+}
+
+static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
+{
+ if (cc->new) {
+ revert_creds(cc->old);
+ put_cred(cc->new);
+ }
+}
+
+/*
+ * Copyup using workdir to prepare temp file. Used when copying up directories,
+ * special files or when upper fs doesn't support O_TMPFILE.
+ */
+static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
{
- struct inode *udir = c->destdir->d_inode;
struct inode *inode;
- struct dentry *newdentry = NULL;
- struct dentry *temp;
+ struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+ struct dentry *temp, *upper;
+ struct ovl_cu_creds cc;
int err;
+ struct ovl_cattr cattr = {
+ /* Can't properly set mode on creation because of the umask */
+ .mode = c->stat.mode & S_IFMT,
+ .rdev = c->stat.rdev,
+ .link = c->link
+ };
+
+ err = ovl_lock_rename_workdir(c->workdir, c->destdir);
+ if (err)
+ return err;
+
+ err = ovl_prep_cu_creds(c->dentry, &cc);
+ if (err)
+ goto unlock;
- temp = ovl_get_tmpfile(c);
+ temp = ovl_create_temp(c->workdir, &cattr);
+ ovl_revert_cu_creds(&cc);
+
+ err = PTR_ERR(temp);
if (IS_ERR(temp))
- return PTR_ERR(temp);
+ goto unlock;
err = ovl_copy_up_inode(c, temp);
if (err)
- goto out;
+ goto cleanup;
if (S_ISDIR(c->stat.mode) && c->indexed) {
err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
if (err)
- goto out;
+ goto cleanup;
}
- if (c->tmpfile) {
- inode_lock_nested(udir, I_MUTEX_PARENT);
- err = ovl_install_temp(c, temp, &newdentry);
- inode_unlock(udir);
- } else {
- err = ovl_install_temp(c, temp, &newdentry);
- }
+ upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto cleanup;
+
+ err = ovl_do_rename(wdir, temp, udir, upper, 0);
+ dput(upper);
if (err)
- goto out;
+ goto cleanup;
if (!c->metacopy)
ovl_set_upperdata(d_inode(c->dentry));
inode = d_inode(c->dentry);
- ovl_inode_update(inode, newdentry);
+ ovl_inode_update(inode, temp);
if (S_ISDIR(inode->i_mode))
ovl_set_flag(OVL_WHITEOUTS, inode);
+unlock:
+ unlock_rename(c->workdir, c->destdir);
-out:
- if (err && !c->tmpfile)
- ovl_cleanup(d_inode(c->workdir), temp);
- dput(temp);
return err;
+cleanup:
+ ovl_cleanup(wdir, temp);
+ dput(temp);
+ goto unlock;
+}
+
+/* Copyup using O_TMPFILE which does not require cross dir locking */
+static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
+{
+ struct inode *udir = d_inode(c->destdir);
+ struct dentry *temp, *upper;
+ struct ovl_cu_creds cc;
+ int err;
+
+ err = ovl_prep_cu_creds(c->dentry, &cc);
+ if (err)
+ return err;
+
+ temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
+ ovl_revert_cu_creds(&cc);
+
+ if (IS_ERR(temp))
+ return PTR_ERR(temp);
+
+ err = ovl_copy_up_inode(c, temp);
+ if (err)
+ goto out_dput;
+
+ inode_lock_nested(udir, I_MUTEX_PARENT);
+
+ upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+ err = PTR_ERR(upper);
+ if (!IS_ERR(upper)) {
+ err = ovl_do_link(temp, udir, upper);
+ dput(upper);
+ }
+ inode_unlock(udir);
+
+ if (err)
+ goto out_dput;
+
+ if (!c->metacopy)
+ ovl_set_upperdata(d_inode(c->dentry));
+ ovl_inode_update(d_inode(c->dentry), temp);
+
+ return 0;
+
+out_dput:
+ dput(temp);
+ return err;
}
/*
@@ -646,18 +685,10 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
}
/* Should we copyup with O_TMPFILE or with workdir? */
- if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
- c->tmpfile = true;
- err = ovl_copy_up_locked(c);
- } else {
- err = ovl_lock_rename_workdir(c->workdir, c->destdir);
- if (!err) {
- err = ovl_copy_up_locked(c);
- unlock_rename(c->workdir, c->destdir);
- }
- }
-
-
+ if (S_ISREG(c->stat.mode) && ofs->tmpfile)
+ err = ovl_copy_up_tmpfile(c);
+ else
+ err = ovl_copy_up_workdir(c);
if (err)
goto out;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 276914ae3c60..82c129bfe58d 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -414,13 +414,12 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
return 0;
- size = posix_acl_to_xattr(NULL, acl, NULL, 0);
+ size = posix_acl_xattr_size(acl->a_count);
buffer = kmalloc(size, GFP_KERNEL);
if (!buffer)
return -ENOMEM;
- size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- err = size;
+ err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
if (err < 0)
goto out_free;
@@ -463,6 +462,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (IS_ERR(upper))
goto out_unlock;
+ err = -ESTALE;
+ if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
+ goto out_dput;
+
newdentry = ovl_create_temp(workdir, cattr);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
@@ -648,11 +651,22 @@ static int ovl_symlink(struct inode *dir, struct dentry *dentry,
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
+static int ovl_set_link_redirect(struct dentry *dentry)
+{
+ const struct cred *old_cred;
+ int err;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = ovl_set_redirect(dentry, false);
+ revert_creds(old_cred);
+
+ return err;
+}
+
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
- bool locked = false;
struct inode *inode;
err = ovl_want_write(old);
@@ -663,13 +677,17 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
goto out_drop_write;
+ err = ovl_copy_up(new->d_parent);
+ if (err)
+ goto out_drop_write;
+
if (ovl_is_metacopy_dentry(old)) {
- err = ovl_set_redirect(old, false);
+ err = ovl_set_link_redirect(old);
if (err)
goto out_drop_write;
}
- err = ovl_nlink_start(old, &locked);
+ err = ovl_nlink_start(old);
if (err)
goto out_drop_write;
@@ -682,7 +700,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
iput(inode);
- ovl_nlink_end(old, locked);
+ ovl_nlink_end(old);
out_drop_write:
ovl_drop_write(old);
out:
@@ -807,7 +825,6 @@ static bool ovl_pure_upper(struct dentry *dentry)
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
- bool locked = false;
const struct cred *old_cred;
struct dentry *upperdentry;
bool lower_positive = ovl_lower_positive(dentry);
@@ -828,7 +845,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (err)
goto out_drop_write;
- err = ovl_nlink_start(dentry, &locked);
+ err = ovl_nlink_start(dentry);
if (err)
goto out_drop_write;
@@ -844,7 +861,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
else
drop_nlink(dentry->d_inode);
}
- ovl_nlink_end(dentry, locked);
+ ovl_nlink_end(dentry);
/*
* Copy ctime
@@ -1008,7 +1025,6 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
unsigned int flags)
{
int err;
- bool locked = false;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
@@ -1017,6 +1033,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
+ bool update_nlink = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = d_is_dir(new);
@@ -1074,10 +1091,12 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
- } else {
- err = ovl_nlink_start(new, &locked);
+ } else if (d_inode(new)) {
+ err = ovl_nlink_start(new);
if (err)
goto out_drop_write;
+
+ update_nlink = true;
}
old_cred = ovl_override_creds(old->d_sb);
@@ -1206,7 +1225,8 @@ out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
revert_creds(old_cred);
- ovl_nlink_end(new, locked);
+ if (update_nlink)
+ ovl_nlink_end(new);
out_drop_write:
ovl_drop_write(old);
out:
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 8fa37cd7818a..54e5d17d7f3e 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -754,9 +754,8 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
goto out;
}
- /* Otherwise, get a connected non-upper dir or disconnected non-dir */
- if (d_is_dir(origin.dentry) &&
- (origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
+ /* Find origin.dentry again with ovl_acceptable() layer check */
+ if (d_is_dir(origin.dentry)) {
dput(origin.dentry);
origin.dentry = NULL;
err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
@@ -769,6 +768,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
goto out_err;
}
+ /* Get a connected non-upper dir or disconnected non-dir */
dentry = ovl_get_dentry(sb, NULL, &origin, index);
out:
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 986313da0c88..84dd957efa24 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -434,14 +434,14 @@ enum ovl_copyop {
OVL_DEDUPE,
};
-static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
+static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
- u64 len, unsigned int flags, enum ovl_copyop op)
+ loff_t len, unsigned int flags, enum ovl_copyop op)
{
struct inode *inode_out = file_inode(file_out);
struct fd real_in, real_out;
const struct cred *old_cred;
- ssize_t ret;
+ loff_t ret;
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
@@ -462,12 +462,13 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len);
+ real_out.file, pos_out, len, flags);
break;
case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
- real_out.file, pos_out, len);
+ real_out.file, pos_out, len,
+ flags);
break;
}
revert_creds(old_cred);
@@ -489,26 +490,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
OVL_COPY);
}
-static int ovl_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
- OVL_CLONE);
-}
+ enum ovl_copyop op;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ op = OVL_DEDUPE;
+ else
+ op = OVL_CLONE;
-static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
-{
/*
* Don't copy up because of a dedupe request, this wouldn't make sense
* most of the time (data would be duplicated instead of deduplicated).
*/
- if (!ovl_inode_upper(file_inode(file_in)) ||
- !ovl_inode_upper(file_inode(file_out)))
+ if (op == OVL_DEDUPE &&
+ (!ovl_inode_upper(file_inode(file_in)) ||
+ !ovl_inode_upper(file_inode(file_out))))
return -EPERM;
- return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
- OVL_DEDUPE);
+ return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+ remap_flags, op);
}
const struct file_operations ovl_file_operations = {
@@ -525,6 +531,5 @@ const struct file_operations ovl_file_operations = {
.compat_ioctl = ovl_compat_ioctl,
.copy_file_range = ovl_copy_file_range,
- .clone_file_range = ovl_clone_file_range,
- .dedupe_file_range = ovl_dedupe_file_range,
+ .remap_file_range = ovl_remap_file_range,
};
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 9c0ca6a7becf..efd372312ef1 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -422,8 +422,10 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name,
fh = ovl_encode_real_fh(real, is_upper);
err = PTR_ERR(fh);
- if (IS_ERR(fh))
+ if (IS_ERR(fh)) {
+ fh = NULL;
goto fail;
+ }
err = ovl_verify_fh(dentry, name, fh);
if (set && err == -ENODATA)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index a3c0d9584312..5e45cb3630a0 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -271,8 +271,8 @@ bool ovl_test_flag(unsigned long flag, struct inode *inode);
bool ovl_inuse_trylock(struct dentry *dentry);
void ovl_inuse_unlock(struct dentry *dentry);
bool ovl_need_index(struct dentry *dentry);
-int ovl_nlink_start(struct dentry *dentry, bool *locked);
-void ovl_nlink_end(struct dentry *dentry, bool locked);
+int ovl_nlink_start(struct dentry *dentry);
+void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
int ovl_check_metacopy_xattr(struct dentry *dentry);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
@@ -290,6 +290,16 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb)
return ofs->xino_bits;
}
+static inline int ovl_inode_lock(struct inode *inode)
+{
+ return mutex_lock_interruptible(&OVL_I(inode)->lock);
+}
+
+static inline void ovl_inode_unlock(struct inode *inode)
+{
+ mutex_unlock(&OVL_I(inode)->lock);
+}
+
/* namei.c */
int ovl_check_fh_len(struct ovl_fh *fh, int fh_len);
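Both wrappers preserve the interruptible-lock pattern, so callers still have to handle a failed lock attempt. An illustrative caller:

err = ovl_inode_lock(inode);
if (err)
	return err;	/* -EINTR: interrupted by a signal */
/* ... mutate overlay inode state under the lock ... */
ovl_inode_unlock(inode);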
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 30adc9d408a0..0116735cc321 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -472,6 +472,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
{
char *p;
int err;
+ bool metacopy_opt = false, redirect_opt = false;
config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL);
if (!config->redirect_mode)
@@ -516,6 +517,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->redirect_mode = match_strdup(&args[0]);
if (!config->redirect_mode)
return -ENOMEM;
+ redirect_opt = true;
break;
case OPT_INDEX_ON:
@@ -548,6 +550,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
case OPT_METACOPY_ON:
config->metacopy = true;
+ metacopy_opt = true;
break;
case OPT_METACOPY_OFF:
@@ -572,13 +575,32 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
if (err)
return err;
- /* metacopy feature with upper requires redirect_dir=on */
- if (config->upperdir && config->metacopy && !config->redirect_dir) {
- pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=on\", falling back to metacopy=off.\n");
- config->metacopy = false;
- } else if (config->metacopy && !config->redirect_follow) {
- pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=follow\" on non-upper mount, falling back to metacopy=off.\n");
- config->metacopy = false;
+ /*
+ * This is to make the logic below simpler. It doesn't make any other
+ * difference, since config->redirect_dir is only used for upper.
+ */
+ if (!config->upperdir && config->redirect_follow)
+ config->redirect_dir = true;
+
+ /* Resolve metacopy -> redirect_dir dependency */
+ if (config->metacopy && !config->redirect_dir) {
+ if (metacopy_opt && redirect_opt) {
+ pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n",
+ config->redirect_mode);
+ return -EINVAL;
+ }
+ if (redirect_opt) {
+ /*
+ * There was an explicit redirect_dir=... that resulted
+ * in this conflict.
+ */
+ pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n",
+ config->redirect_mode);
+ config->metacopy = false;
+ } else {
+ /* Automatically enable redirect otherwise. */
+ config->redirect_follow = config->redirect_dir = true;
+ }
}
return 0;
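The resulting option resolution, summarized (assuming an upper layer is configured; "explicit" means the option appeared in the mount options):

/*
 * metacopy=on (explicit) + redirect_dir (explicit, not "on") -> -EINVAL
 * metacopy=on (config)   + redirect_dir (explicit, not "on") -> metacopy off
 * metacopy=on            + redirect_dir left at default      -> redirect_dir
 *                                                               auto-enabled
 */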
@@ -1175,9 +1197,29 @@ out:
return err;
}
+static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
+{
+ unsigned int i;
+
+ if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt))
+ return true;
+
+ for (i = 0; i < ofs->numlowerfs; i++) {
+ /*
+ * We use uuid to associate an overlay lower file handle with a
+ * lower layer, so we can accept lower fs with null uuid as long
+ * as all lower layers with null uuid are on the same fs.
+ */
+ if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid))
+ return false;
+ }
+ return true;
+}
+
/* Get a unique fsid for the layer */
-static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb)
+static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
{
+ struct super_block *sb = path->mnt->mnt_sb;
unsigned int i;
dev_t dev;
int err;
@@ -1191,6 +1233,14 @@ static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb)
return i + 1;
}
+ if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
+ ofs->config.index = false;
+ ofs->config.nfs_export = false;
+ pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
+ uuid_is_null(&sb->s_uuid) ? "null" : "conflicting",
+ path->dentry);
+ }
+
err = get_anon_bdev(&dev);
if (err) {
pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
@@ -1225,7 +1275,7 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
struct vfsmount *mnt;
int fsid;
- err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb);
+ err = fsid = ovl_get_fsid(ofs, &stack[i]);
if (err < 0)
goto out;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index ace4fe4c39a9..7c01327b1852 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -65,8 +65,7 @@ struct super_block *ovl_same_sb(struct super_block *sb)
*/
int ovl_can_decode_fh(struct super_block *sb)
{
- if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry ||
- uuid_is_null(&sb->s_uuid))
+ if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
return 0;
return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
@@ -522,13 +521,13 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags)
int ovl_copy_up_start(struct dentry *dentry, int flags)
{
- struct ovl_inode *oi = OVL_I(d_inode(dentry));
+ struct inode *inode = d_inode(dentry);
int err;
- err = mutex_lock_interruptible(&oi->lock);
+ err = ovl_inode_lock(inode);
if (!err && ovl_already_copied_up_locked(dentry, flags)) {
err = 1; /* Already copied up */
- mutex_unlock(&oi->lock);
+ ovl_inode_unlock(inode);
}
return err;
@@ -536,7 +535,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags)
void ovl_copy_up_end(struct dentry *dentry)
{
- mutex_unlock(&OVL_I(d_inode(dentry))->lock);
+ ovl_inode_unlock(d_inode(dentry));
}
bool ovl_check_origin_xattr(struct dentry *dentry)
@@ -739,14 +738,14 @@ fail:
* Operations that change overlay inode and upper inode nlink need to be
* synchronized with copy up for persistent nlink accounting.
*/
-int ovl_nlink_start(struct dentry *dentry, bool *locked)
+int ovl_nlink_start(struct dentry *dentry)
{
- struct ovl_inode *oi = OVL_I(d_inode(dentry));
+ struct inode *inode = d_inode(dentry);
const struct cred *old_cred;
int err;
- if (!d_inode(dentry))
- return 0;
+ if (WARN_ON(!inode))
+ return -ENOENT;
/*
* With inodes index is enabled, we store the union overlay nlink
@@ -768,11 +767,11 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked)
return err;
}
- err = mutex_lock_interruptible(&oi->lock);
+ err = ovl_inode_lock(inode);
if (err)
return err;
- if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+ if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
goto out;
old_cred = ovl_override_creds(dentry->d_sb);
@@ -787,27 +786,24 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked)
out:
if (err)
- mutex_unlock(&oi->lock);
- else
- *locked = true;
+ ovl_inode_unlock(inode);
return err;
}
-void ovl_nlink_end(struct dentry *dentry, bool locked)
+void ovl_nlink_end(struct dentry *dentry)
{
- if (locked) {
- if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) &&
- d_inode(dentry)->i_nlink == 0) {
- const struct cred *old_cred;
+ struct inode *inode = d_inode(dentry);
- old_cred = ovl_override_creds(dentry->d_sb);
- ovl_cleanup_index(dentry);
- revert_creds(old_cred);
- }
+ if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
+ const struct cred *old_cred;
- mutex_unlock(&OVL_I(d_inode(dentry))->lock);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ ovl_cleanup_index(dentry);
+ revert_creds(old_cred);
}
+
+ ovl_inode_unlock(inode);
}
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..51d5fd8840ab 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 53d411a371ce..1100e810d855 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -10,6 +10,7 @@
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/nsproxy.h>
+#include <uapi/linux/mount.h>
#include "internal.h"
#include "pnode.h"
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0ceb3b6b37e7..2edbb657f859 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
- seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
- seq_printf(m, "unknown");
+ seq_puts(m, "unknown");
break;
case PR_SPEC_NOT_AFFECTED:
- seq_printf(m, "not vulnerable");
+ seq_puts(m, "not vulnerable");
break;
case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
- seq_printf(m, "thread force mitigated");
+ seq_puts(m, "thread force mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
- seq_printf(m, "thread mitigated");
+ seq_puts(m, "thread mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
- seq_printf(m, "thread vulnerable");
+ seq_puts(m, "thread vulnerable");
break;
case PR_SPEC_DISABLE:
- seq_printf(m, "globally mitigated");
+ seq_puts(m, "globally mitigated");
break;
default:
- seq_printf(m, "vulnerable");
+ seq_puts(m, "vulnerable");
break;
}
seq_putc(m, '\n');
@@ -392,6 +392,15 @@ static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
seq_putc(m, '\n');
}
+static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
+{
+ bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
+
+ if (thp_enabled)
+ thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);
+ seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -406,6 +415,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
task_core_dumping(m, mm);
+ task_thp_status(m, mm);
mmput(mm);
}
task_sig(m, task);
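
With task_thp_status() wired into proc_pid_status() above, /proc/<pid>/status gains one line (value illustrative):

	THP_enabled:	1

It reads 1 when CONFIG_TRANSPARENT_HUGEPAGE is built in and the task has not set MMF_DISABLE_THP (e.g. via PR_SET_THP_DISABLE), and 0 otherwise.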
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07bf260d..5ab1849971b4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -140,9 +140,13 @@ struct pid_entry {
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define ONE(NAME, MODE, show) \
- NOD(NAME, (S_IFREG|(MODE)), \
+ NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+#define ATTR(LSM, NAME, MODE) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_pid_attr_operations, \
+ { .lsm = LSM })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -456,7 +460,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
@@ -530,7 +534,7 @@ static const struct file_operations proc_lstats_operations = {
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long totalpages = totalram_pages + total_swap_pages;
+ unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
points = oom_badness(task, NULL, NULL, totalpages) *
@@ -581,8 +585,10 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
/*
* print the file header
*/
- seq_printf(m, "%-25s %-20s %-20s %-10s\n",
- "Limit", "Soft Limit", "Hard Limit", "Units");
+ seq_puts(m, "Limit "
+ "Soft Limit "
+ "Hard Limit "
+ "Units \n");
for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
@@ -1084,10 +1090,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
task_lock(p);
if (!p->vfork_done && process_shares_mm(p, mm)) {
- pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
- task_pid_nr(p), p->comm,
- p->signal->oom_score_adj, oom_adj,
- task_pid_nr(task), task->comm);
p->signal->oom_score_adj = oom_adj;
if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1208,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
@@ -2356,10 +2358,13 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
return -ESRCH;
if (p != current) {
- if (!capable(CAP_SYS_NICE)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
count = -EPERM;
goto out;
}
+ rcu_read_unlock();
err = security_task_setscheduler(p);
if (err) {
@@ -2392,11 +2397,14 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
return -ESRCH;
if (p != current) {
-
- if (!capable(CAP_SYS_NICE)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
goto out;
}
+ rcu_read_unlock();
+
err = security_task_getscheduler(p);
if (err)
goto out;
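
Both timerslack_ns paths switch from capable(), which checks CAP_SYS_NICE against the initial user namespace, to ns_capable() against the target task's own credentials, letting a namespace's root manage timer slack for its own tasks; RCU protects the __task_cred() dereference. The recurring pattern, distilled:

	rcu_read_lock();
	if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
		rcu_read_unlock();
		return -EPERM;	/* callers map this onto count/err */
	}
	rcu_read_unlock();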
@@ -2517,7 +2525,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsm,
(char*)file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2566,7 +2574,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
+ rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ file->f_path.dentry->d_name.name, page,
+ count);
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
kfree(page);
@@ -2580,13 +2590,53 @@ static const struct file_operations proc_pid_attr_operations = {
.llseek = generic_file_llseek,
};
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ struct dir_context *ctx) \
+{ \
+ return proc_pident_readdir(filp, ctx, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ .read = generic_read_dir, \
+ .iterate = proc_##LSM##_attr_dir_iterate, \
+ .llseek = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ struct dentry *dentry, unsigned int flags) \
+{ \
+ return proc_pident_lookup(dir, dentry, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ .lookup = proc_##LSM##_attr_dir_lookup, \
+ .getattr = pid_getattr, \
+ .setattr = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+ ATTR("smack", "current", 0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
static const struct pid_entry attr_dir_stuff[] = {
- REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("prev", S_IRUGO, proc_pid_attr_operations),
- REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ ATTR(NULL, "current", 0666),
+ ATTR(NULL, "prev", 0444),
+ ATTR(NULL, "exec", 0666),
+ ATTR(NULL, "fscreate", 0666),
+ ATTR(NULL, "keycreate", 0666),
+ ATTR(NULL, "sockcreate", 0666),
+#ifdef CONFIG_SECURITY_SMACK
+ DIR("smack", 0555,
+ proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
};
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
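
For reference, LSM_DIR_OPS(smack) expands (simplified) to callbacks bound to smack_attr_dir_stuff, e.g. the iterate side:

	static int proc_smack_attr_dir_iterate(struct file *filp,
					       struct dir_context *ctx)
	{
		return proc_pident_readdir(filp, ctx, smack_attr_dir_stuff,
					   ARRAY_SIZE(smack_attr_dir_stuff));
	}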
@@ -2905,6 +2955,21 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_STACKLEAK_METRICS
+static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned long prev_depth = THREAD_SIZE -
+ (task->prev_lowest_stack & (THREAD_SIZE - 1));
+ unsigned long depth = THREAD_SIZE -
+ (task->lowest_stack & (THREAD_SIZE - 1));
+
+ seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+ prev_depth, depth);
+ return 0;
+}
+#endif /* CONFIG_STACKLEAK_METRICS */
+
/*
* Thread groups
*/
@@ -2979,7 +3044,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -3006,6 +3071,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+#ifdef CONFIG_STACKLEAK_METRICS
+ ONE("stack_depth", S_IRUGO, proc_stack_depth),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3139,7 +3207,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
return d_splice_alias(inode, dentry);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
unsigned tgid;
@@ -3364,7 +3432,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ae109429a88..e39bac94dead 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &proc_misc_dentry_ops);
+ d_set_d_op(dentry, de->proc_dops);
return d_splice_alias(inode, dentry);
}
read_unlock(&proc_subdir_lock);
@@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+ ent->proc_dops = &proc_misc_dentry_ops;
+
out:
return ent;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fc5306a31a1d..da649ccd6804 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -59,7 +59,6 @@ static struct kmem_cache *pde_opener_cache __ro_after_init;
static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- struct inode *inode;
ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
if (!ei)
@@ -71,8 +70,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
ei->ns_ops = NULL;
- inode = &ei->vfs_inode;
- return inode;
+ return &ei->vfs_inode;
}
static void proc_i_callback(struct rcu_head *head)
@@ -516,6 +514,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
*/
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink.seeks = 0;
+
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
if (!root_inode) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..ea575375f210 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,6 +44,7 @@ struct proc_dir_entry {
struct completion *pde_unload_completion;
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
+ const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
@@ -81,6 +82,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
+ const char *lsm;
};
struct proc_inode {
@@ -161,7 +163,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d297fe4472a9..bbcc185062bb 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -22,7 +22,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index d06694757201..8468baee951d 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -10,9 +10,6 @@
#include <linux/seqlock.h>
#include <linux/time.h>
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
static int loadavg_proc_show(struct seq_file *m, void *v)
{
unsigned long avnrun[3];
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index edda898714eb..568d90e17c17 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
long cached;
long available;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long sreclaimable, sunreclaim;
int lru;
si_meminfo(&i);
@@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
+ sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
+ sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Mapped: ",
global_node_page_state(NR_FILE_MAPPED));
show_val_kb(m, "Shmem: ", i.sharedram);
- show_val_kb(m, "Slab: ",
- global_node_page_state(NR_SLAB_RECLAIMABLE) +
- global_node_page_state(NR_SLAB_UNRECLAIMABLE));
-
- show_val_kb(m, "SReclaimable: ",
- global_node_page_state(NR_SLAB_RECLAIMABLE));
- show_val_kb(m, "SUnreclaim: ",
- global_node_page_state(NR_SLAB_UNRECLAIMABLE));
+ show_val_kb(m, "KReclaimable: ", sreclaimable +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
+ show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
+ show_val_kb(m, "SReclaimable: ", sreclaimable);
+ show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
global_zone_page_state(NR_KERNEL_STACK_KB));
show_val_kb(m, "PageTables: ",
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 792c78a49174..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -46,7 +46,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
ppage = pfn_to_page(pfn);
else
ppage = NULL;
- if (!ppage || PageSlab(ppage))
+ if (!ppage || PageSlab(ppage) || page_has_type(ppage))
pcount = 0;
else
pcount = page_mapcount(ppage);
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d5e0fcb3439e..a7b12435519e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+static const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
+static void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->proc_dops = &proc_net_dentry_ops;
+}
+
static int seq_open_net(struct inode *inode, struct file *file)
{
unsigned int state_size = PDE(inode)->state_size;
@@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_seq_fops;
p->seq_ops = ops;
p->state_size = state_size;
@@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_seq_fops;
p->seq_ops = ops;
p->state_size = state_size;
@@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_single_fops;
p->single_show = show;
return proc_register(parent, p);
@@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_single_fops;
p->single_show = show;
p->write = write;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 89921a0d2ebb..4d598a399bbf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -464,7 +464,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode = new_inode(sb);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOMEM);
inode->i_ino = get_next_ino();
@@ -474,8 +474,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
if (unlikely(head->unregistering)) {
spin_unlock(&sysctl_lock);
iput(inode);
- inode = NULL;
- goto out;
+ return ERR_PTR(-ENOENT);
}
ei->sysctl = head;
ei->sysctl_entry = table;
@@ -500,7 +499,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
if (root->set_ownership)
root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
-out:
return inode;
}
@@ -549,10 +547,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (!inode)
+ if (IS_ERR(inode)) {
+ err = ERR_CAST(inode);
goto out;
+ }
d_set_d_op(dentry, &proc_sys_dentry_operations);
err = d_splice_alias(inode, dentry);
@@ -685,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file,
if (d_in_lookup(child)) {
struct dentry *res;
inode = proc_sys_make_inode(dir->d_sb, head, table);
- if (!inode) {
+ if (IS_ERR(inode)) {
d_lookup_done(child);
dput(child);
return false;
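
proc_sys_make_inode() now returns ERR_PTR(-ENOMEM) or ERR_PTR(-ENOENT) rather than NULL, so a table racing with unregistration no longer masquerades as an allocation failure. Callers follow the usual error-pointer pattern, sketched:

	inode = proc_sys_make_inode(dir->i_sb, head, table);
	if (IS_ERR(inode)) {
		err = ERR_CAST(inode);	/* -ENOMEM or -ENOENT */
		goto out;
	}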
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d2eca6..621e6ec322ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat,
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
return proc_lookup(dir, dentry, flags);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e5c55f..57c0a1047250 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
d_add(self, inode);
+ ret = 0;
} else {
dput(self);
- self = ERR_PTR(-ENOMEM);
}
- } else {
- self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+ else
+ ns->proc_self = self;
+
+ return ret;
}
void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 535eda7857cf..80c305f206bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
#ifdef arch_idle_time
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
idle += arch_idle_time(cpu);
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
if (cpu_online(cpu) && nr_iowait_cpu(cpu))
iowait += arch_idle_time(cpu);
return iowait;
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu)
#else
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu)
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu)
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
@@ -79,6 +79,31 @@ static u64 get_iowait_time(int cpu)
#endif
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+ static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+ while (gap > 0) {
+ unsigned int inc;
+
+ inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+ seq_write(p, zeros, 2 * inc);
+ gap -= inc;
+ }
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+ unsigned int i, next = 0;
+
+ for_each_active_irq(i) {
+ show_irq_gap(p, i - next);
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+ next = i + 1;
+ }
+ show_irq_gap(p, nr_irqs - next);
+}
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
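
show_all_irqs() visits only the active (allocated) IRQ descriptors and pads the holes with " 0" columns: zeros[] holds sixteen such pairs (32 characters plus the NUL, so ARRAY_SIZE(zeros) / 2 == 16), and each seq_write() emits at most sixteen columns per loop. With active IRQs {0, 1, 5}, for example, the intr line prints the counts for 0 and 1, three padded zeros for 2-4, the count for 5, and then zeros up to nr_irqs, sparing the old per-IRQ kstat_irqs_usr() calls on sparse systems.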
@@ -95,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+ user += kcs->cpustat[CPUTIME_USER];
+ nice += kcs->cpustat[CPUTIME_NICE];
+ system += kcs->cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(kcs, i);
+ iowait += get_iowait_time(kcs, i);
+ irq += kcs->cpustat[CPUTIME_IRQ];
+ softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal += kcs->cpustat[CPUTIME_STEAL];
+ guest += kcs->cpustat[CPUTIME_GUEST];
+ guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -130,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = kcs->cpustat[CPUTIME_USER];
+ nice = kcs->cpustat[CPUTIME_NICE];
+ system = kcs->cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(kcs, i);
+ iowait = get_iowait_time(kcs, i);
+ irq = kcs->cpustat[CPUTIME_IRQ];
+ softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal = kcs->cpustat[CPUTIME_STEAL];
+ guest = kcs->cpustat[CPUTIME_GUEST];
+ guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -156,9 +185,7 @@ static int show_stat(struct seq_file *p, void *v)
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
- /* sum again ? it could be updated? */
- for_each_irq_nr(j)
- seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
+ show_all_irqs(p);
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4..beccb0b1d57c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -423,7 +423,7 @@ struct mem_size_stats {
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty)
+ bool compound, bool young, bool dirty, bool locked)
{
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ if (locked)
+ mss->pss_locked += (u64)size << PSS_SHIFT;
return;
}
for (i = 0; i < nr; i++, page++) {
int mapcount = page_mapcount(page);
+ unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
if (mapcount >= 2) {
if (dirty || PageDirty(page))
mss->shared_dirty += PAGE_SIZE;
else
mss->shared_clean += PAGE_SIZE;
- mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ mss->pss += pss / mapcount;
+ if (locked)
+ mss->pss_locked += pss / mapcount;
} else {
if (dirty || PageDirty(page))
mss->private_dirty += PAGE_SIZE;
else
mss->private_clean += PAGE_SIZE;
- mss->pss += PAGE_SIZE << PSS_SHIFT;
+ mss->pss += pss;
+ if (locked)
+ mss->pss_locked += pss;
}
}
}
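
Threading `locked` into smaps_account() accumulates the locked share page by page, mirroring how PSS itself is split by mapcount. The old approach (removed at the end of smap_gather_stats() below) added the vma's entire pss to pss_locked after each walk, which over-counted in smaps_rollup, where mss accumulates across all vmas.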
@@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
if (pte_present(*pte)) {
@@ -521,7 +529,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
mss->swap += PAGE_SIZE;
else
put_page(page);
@@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page;
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
VM_BUG_ON_PAGE(1, page);
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -713,6 +722,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
smaps_walk.private = mss;
#ifdef CONFIG_SHMEM
+ /* In case of smaps_rollup, reset the value from previous vma */
+ mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -728,18 +739,15 @@ static void smap_gather_stats(struct vm_area_struct *vma,
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
!(vma->vm_flags & VM_WRITE)) {
- mss->swap = shmem_swapped;
+ mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
smaps_walk.pte_hole = smaps_pte_hole;
}
}
#endif
-
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
- if (vma->vm_flags & VM_LOCKED)
- mss->pss_locked += mss->pss;
}
#define SEQ_PUT_DEC(str, val) \
@@ -788,6 +796,8 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss);
+ seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma));
+
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
show_smap_vma_flags(m, vma);
@@ -938,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -1094,6 +1106,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
};
@@ -1137,11 +1150,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
downgrade_write(&mm->mmap_sem);
break;
}
- mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+ mmu_notifier_range_init(&range, mm, 0, -1UL);
+ mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
- mmu_notifier_invalidate_range_end(mm, 0, -1);
+ mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, 0, -1);
up_read(&mm->mmap_sem);
out_mm:
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 0b63d68dedb2..36bf0f2e102e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
else
bytes += kobjsize(current->files);
- if (current->sighand && atomic_read(&current->sighand->count) > 1)
+ if (current->sighand && refcount_read(&current->sighand->count) > 1)
sbytes += kobjsize(current->sighand);
else
bytes += kobjsize(current->sighand);
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
seq_file_path(m, file, "");
} else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010ca9eb..f61ae53533f5 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
d_add(thread_self, inode);
+ ret = 0;
} else {
dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+ else
+ ns->proc_thread_self = thread_self;
+
+ return ret;
}
void __init proc_thread_self_init(void)
diff --git a/fs/proc/util.c b/fs/proc/util.c
index b161cfa0f9fa..98f8adc17345 100644
--- a/fs/proc/util.c
+++ b/fs/proc/util.c
@@ -1,4 +1,5 @@
#include <linux/dcache.h>
+#include "internal.h"
unsigned name_to_int(const struct qstr *qstr)
{
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91ae16fbd7d5..3fe90443c1bb 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -16,7 +16,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
@@ -423,7 +423,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
if (rc < 0) {
unlock_page(page);
put_page(page);
- return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ return vmf_error(rc);
}
SetPageUptodate(page);
}
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 503086f7f7c1..0d19d191ae70 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -141,7 +141,6 @@ config PSTORE_RAM
tristate "Log panic/oops to a RAM buffer"
depends on PSTORE
depends on HAS_IOMEM
- depends on HAVE_MEMBLOCK
select REED_SOLOMON
select REED_SOLOMON_ENC8
select REED_SOLOMON_DEC8
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 06aab07b6bb7..b8a0931568f8 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -148,7 +148,7 @@ void pstore_unregister_ftrace(void)
mutex_lock(&pstore_ftrace_lock);
if (pstore_ftrace_enabled) {
unregister_ftrace_function(&pstore_ftrace_ops);
- pstore_ftrace_enabled = 0;
+ pstore_ftrace_enabled = false;
}
mutex_unlock(&pstore_ftrace_lock);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 8cf2218b46a7..c60ee46f3e39 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -335,53 +335,10 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
goto fail_alloc;
private->record = record;
- switch (record->type) {
- case PSTORE_TYPE_DMESG:
- scnprintf(name, sizeof(name), "dmesg-%s-%llu%s",
- record->psi->name, record->id,
- record->compressed ? ".enc.z" : "");
- break;
- case PSTORE_TYPE_CONSOLE:
- scnprintf(name, sizeof(name), "console-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_FTRACE:
- scnprintf(name, sizeof(name), "ftrace-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_MCE:
- scnprintf(name, sizeof(name), "mce-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_PPC_RTAS:
- scnprintf(name, sizeof(name), "rtas-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_PPC_OF:
- scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_PPC_COMMON:
- scnprintf(name, sizeof(name), "powerpc-common-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_PMSG:
- scnprintf(name, sizeof(name), "pmsg-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_PPC_OPAL:
- scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu",
- record->psi->name, record->id);
- break;
- case PSTORE_TYPE_UNKNOWN:
- scnprintf(name, sizeof(name), "unknown-%s-%llu",
- record->psi->name, record->id);
- break;
- default:
- scnprintf(name, sizeof(name), "type%d-%s-%llu",
- record->type, record->psi->name, record->id);
- break;
- }
+ scnprintf(name, sizeof(name), "%s-%s-%llu%s",
+ pstore_type_to_name(record->type),
+ record->psi->name, record->id,
+ record->compressed ? ".enc.z" : "");
dentry = d_alloc_name(root, name);
if (!dentry)
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b821054ca3ed..75887a269b64 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -59,6 +59,19 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
"enabling this option is not safe, it may lead to further "
"corruption on Oopses)");
+/* Names should be in the same order as the enum pstore_type_id */
+static const char * const pstore_type_names[] = {
+ "dmesg",
+ "mce",
+ "console",
+ "ftrace",
+ "rtas",
+ "powerpc-ofw",
+ "powerpc-common",
+ "pmsg",
+ "powerpc-opal",
+};
+
static int pstore_new_entry;
static void pstore_timefunc(struct timer_list *);
@@ -104,6 +117,30 @@ void pstore_set_kmsg_bytes(int bytes)
/* Tag each group of saved records with a sequence number */
static int oopscount;
+const char *pstore_type_to_name(enum pstore_type_id type)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(pstore_type_names) != PSTORE_TYPE_MAX);
+
+ if (WARN_ON_ONCE(type >= PSTORE_TYPE_MAX))
+ return "unknown";
+
+ return pstore_type_names[type];
+}
+EXPORT_SYMBOL_GPL(pstore_type_to_name);
+
+enum pstore_type_id pstore_name_to_type(const char *name)
+{
+ int i;
+
+ for (i = 0; i < PSTORE_TYPE_MAX; i++) {
+ if (!strcmp(pstore_type_names[i], name))
+ return i;
+ }
+
+ return PSTORE_TYPE_MAX;
+}
+EXPORT_SYMBOL_GPL(pstore_name_to_type);
+
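
A hedged usage sketch of the new mapping, which backends such as ramoops use below to tag zones and build file names:

	enum pstore_type_id type = pstore_name_to_type("ftrace");

	if (type == PSTORE_TYPE_MAX)
		pr_warn("unrecognized pstore type name\n");
	else
		pr_info("ftrace -> %s\n", pstore_type_to_name(type));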
static const char *get_reason_str(enum kmsg_dump_reason reason)
{
switch (reason) {
@@ -124,26 +161,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
}
}
-bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+/*
+ * Should pstore_dump() wait for a concurrent pstore_dump()? If
+ * not, the current pstore_dump() will report a failure to dump
+ * and return.
+ */
+static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
{
- /*
- * In case of NMI path, pstore shouldn't be blocked
- * regardless of reason.
- */
+ /* In the NMI path, pstore shouldn't block regardless of reason. */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked by spin lock. */
+ /* Emergency restart shouldn't be blocked. */
case KMSG_DUMP_EMERG:
return true;
default:
return false;
}
}
-EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
static int zbufsize_deflate(size_t size)
@@ -258,20 +296,6 @@ static int pstore_compress(const void *in, void *out,
return outlen;
}
-static int pstore_decompress(void *in, void *out,
- unsigned int inlen, unsigned int outlen)
-{
- int ret;
-
- ret = crypto_comp_decompress(tfm, in, inlen, out, &outlen);
- if (ret) {
- pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
- return ret;
- }
-
- return outlen;
-}
-
static void allocate_buf_for_compression(void)
{
struct crypto_comp *ctx;
@@ -318,7 +342,7 @@ static void allocate_buf_for_compression(void)
big_oops_buf_sz = size;
big_oops_buf = buf;
- pr_info("Using compression: %s\n", zbackend->name);
+ pr_info("Using crash dump compression: %s\n", zbackend->name);
}
static void free_buf_for_compression(void)
@@ -368,9 +392,8 @@ void pstore_record_init(struct pstore_record *record,
}
/*
- * callback from kmsg_dump. (s2,l2) has the most recently
- * written bytes, older bytes are in (s1,l1). Save as much
- * as we can from the end of the buffer.
+ * callback from kmsg_dump. Save as much as we can (up to kmsg_bytes) from the
+ * end of the buffer.
*/
static void pstore_dump(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
@@ -378,23 +401,23 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
- unsigned long flags = 0;
- int is_locked;
int ret;
why = get_reason_str(reason);
- if (pstore_cannot_block_path(reason)) {
- is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
- if (!is_locked) {
- pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
- , in_nmi() ? "NMI" : why);
+ if (down_trylock(&psinfo->buf_lock)) {
+ /* Failed to acquire lock: give up if we cannot wait. */
+ if (pstore_cannot_wait(reason)) {
+ pr_err("dump skipped in %s path: may corrupt error record\n",
+ in_nmi() ? "NMI" : why);
+ return;
+ }
+ if (down_interruptible(&psinfo->buf_lock)) {
+ pr_err("could not grab semaphore?!\n");
return;
}
- } else {
- spin_lock_irqsave(&psinfo->buf_lock, flags);
- is_locked = 1;
}
+
oopscount++;
while (total < kmsg_bytes) {
char *dst;
@@ -411,7 +434,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
record.part = part;
record.buf = psinfo->buf;
- if (big_oops_buf && is_locked) {
+ if (big_oops_buf) {
dst = big_oops_buf;
dst_size = big_oops_buf_sz;
} else {
@@ -429,7 +452,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
dst_size, &dump_size))
break;
- if (big_oops_buf && is_locked) {
+ if (big_oops_buf) {
zipped_len = pstore_compress(dst, psinfo->buf,
header_size + dump_size,
psinfo->bufsize);
@@ -452,8 +475,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
- if (is_locked)
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+ up(&psinfo->buf_lock);
}
static struct kmsg_dumper pstore_dumper = {
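
With buf_lock converted to a semaphore (see the sema_init() in pstore_register() below), sleepable dump paths now wait their turn instead of failing, while NMI, panic and emergency-restart paths still bail out on contention. The policy, distilled as a sketch:

	if (down_trylock(&psinfo->buf_lock)) {
		if (pstore_cannot_wait(reason))
			return;		/* never block in NMI/panic/emerg */
		if (down_interruptible(&psinfo->buf_lock))
			return;		/* sleepable path was interrupted */
	}
	/* ... format and write records ... */
	up(&psinfo->buf_lock);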
@@ -476,31 +499,17 @@ static void pstore_unregister_kmsg(void)
#ifdef CONFIG_PSTORE_CONSOLE
static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
- const char *e = s + c;
-
- while (s < e) {
- struct pstore_record record;
- unsigned long flags;
+ struct pstore_record record;
- pstore_record_init(&record, psinfo);
- record.type = PSTORE_TYPE_CONSOLE;
+ if (!c)
+ return;
- if (c > psinfo->bufsize)
- c = psinfo->bufsize;
+ pstore_record_init(&record, psinfo);
+ record.type = PSTORE_TYPE_CONSOLE;
- if (oops_in_progress) {
- if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
- break;
- } else {
- spin_lock_irqsave(&psinfo->buf_lock, flags);
- }
- record.buf = (char *)s;
- record.size = c;
- psinfo->write(&record);
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
- s += c;
- c = e - s;
- }
+ record.buf = (char *)s;
+ record.size = c;
+ psinfo->write(&record);
}
static struct console pstore_console = {
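
The console path no longer bounces data through psinfo->buf, so the chunking loop and the buf_lock juggling disappear: the record points at the caller's buffer and the backend's write() runs once per console write.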
@@ -589,6 +598,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
+ sema_init(&psinfo->buf_lock, 1);
spin_unlock(&pstore_lock);
if (owner && !try_module_get(owner)) {
@@ -656,8 +666,9 @@ EXPORT_SYMBOL_GPL(pstore_unregister);
static void decompress_record(struct pstore_record *record)
{
+ int ret;
int unzipped_len;
- char *decompressed;
+ char *unzipped, *workspace;
if (!record->compressed)
return;
@@ -668,35 +679,42 @@ static void decompress_record(struct pstore_record *record)
return;
}
- /* No compression method has created the common buffer. */
+ /* Missing compression buffer means compression was not initialized. */
if (!big_oops_buf) {
- pr_warn("no decompression buffer allocated\n");
+ pr_warn("no decompression method initialized!\n");
return;
}
- unzipped_len = pstore_decompress(record->buf, big_oops_buf,
- record->size, big_oops_buf_sz);
- if (unzipped_len <= 0) {
- pr_err("decompression failed: %d\n", unzipped_len);
+ /* Allocate enough space to hold max decompression and ECC. */
+ unzipped_len = big_oops_buf_sz;
+ workspace = kmalloc(unzipped_len + record->ecc_notice_size,
+ GFP_KERNEL);
+ if (!workspace)
return;
- }
- /* Build new buffer for decompressed contents. */
- decompressed = kmalloc(unzipped_len + record->ecc_notice_size,
- GFP_KERNEL);
- if (!decompressed) {
- pr_err("decompression ran out of memory\n");
+ /* After decompression "unzipped_len" is almost certainly smaller. */
+ ret = crypto_comp_decompress(tfm, record->buf, record->size,
+ workspace, &unzipped_len);
+ if (ret) {
+ pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
+ kfree(workspace);
return;
}
- memcpy(decompressed, big_oops_buf, unzipped_len);
/* Append ECC notice to decompressed buffer. */
- memcpy(decompressed + unzipped_len, record->buf + record->size,
+ memcpy(workspace + unzipped_len, record->buf + record->size,
record->ecc_notice_size);
- /* Swap out compresed contents with decompressed contents. */
+ /* Copy decompressed contents into a minimum-sized allocation. */
+ unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
+ GFP_KERNEL);
+ kfree(workspace);
+ if (!unzipped)
+ return;
+
+ /* Swap out compressed contents with decompressed contents. */
kfree(record->buf);
- record->buf = decompressed;
+ record->buf = unzipped;
record->size = unzipped_len;
record->compressed = false;
}
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 24db02de1787..97fcef74e5af 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -33,7 +33,7 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
record.size = count;
/* check outside lock, page in any data. write_user also checks */
- if (!access_ok(VERIFY_READ, buf, count))
+ if (!access_ok(buf, count))
return -EFAULT;
mutex_lock(&pmsg_lock);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index ffcff6516e89..c5c685589e36 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -110,7 +110,6 @@ struct ramoops_context {
};
static struct platform_device *dummy;
-static struct ramoops_platform_data *dummy_data;
static int ramoops_pstore_open(struct pstore_info *psi)
{
@@ -124,31 +123,28 @@ static int ramoops_pstore_open(struct pstore_info *psi)
}
static struct persistent_ram_zone *
-ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
- u64 *id,
- enum pstore_type_id *typep, enum pstore_type_id type,
- bool update)
+ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
+ struct pstore_record *record)
{
struct persistent_ram_zone *prz;
- int i = (*c)++;
/* Give up if we never existed or have hit the end. */
- if (!przs || i >= max)
+ if (!przs)
return NULL;
- prz = przs[i];
+ prz = przs[id];
if (!prz)
return NULL;
/* Update old/shadowed buffer. */
- if (update)
+ if (prz->type == PSTORE_TYPE_DMESG)
persistent_ram_save_old(prz);
if (!persistent_ram_old_size(prz))
return NULL;
- *typep = type;
- *id = i;
+ record->type = prz->type;
+ record->id = id;
return prz;
}
@@ -255,10 +251,8 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
/* Find the next valid persistent_ram_zone for DMESG */
while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
- prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
- cxt->max_dump_cnt, &record->id,
- &record->type,
- PSTORE_TYPE_DMESG, 1);
+ prz = ramoops_get_next_prz(cxt->dprzs, cxt->dump_read_cnt++,
+ record);
if (!prz_ok(prz))
continue;
header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
@@ -272,22 +266,18 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
}
}
- if (!prz_ok(prz))
- prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
- 1, &record->id, &record->type,
- PSTORE_TYPE_CONSOLE, 0);
+ if (!prz_ok(prz) && !cxt->console_read_cnt++)
+ prz = ramoops_get_next_prz(&cxt->cprz, 0 /* single */, record);
- if (!prz_ok(prz))
- prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
- 1, &record->id, &record->type,
- PSTORE_TYPE_PMSG, 0);
+ if (!prz_ok(prz) && !cxt->pmsg_read_cnt++)
+ prz = ramoops_get_next_prz(&cxt->mprz, 0 /* single */, record);
/* ftrace is last since it may want to dynamically allocate memory. */
if (!prz_ok(prz)) {
- if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
- prz = ramoops_get_next_prz(cxt->fprzs,
- &cxt->ftrace_read_cnt, 1, &record->id,
- &record->type, PSTORE_TYPE_FTRACE, 0);
+ if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) &&
+ !cxt->ftrace_read_cnt++) {
+ prz = ramoops_get_next_prz(cxt->fprzs, 0 /* single */,
+ record);
} else {
/*
* Build a new dummy record which combines all the
@@ -299,15 +289,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
GFP_KERNEL);
if (!tmp_prz)
return -ENOMEM;
+ prz = tmp_prz;
free_prz = true;
while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
prz_next = ramoops_get_next_prz(cxt->fprzs,
- &cxt->ftrace_read_cnt,
- cxt->max_ftrace_cnt,
- &record->id,
- &record->type,
- PSTORE_TYPE_FTRACE, 0);
+ cxt->ftrace_read_cnt++, record);
if (!prz_ok(prz_next))
continue;
@@ -321,7 +308,6 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
goto out;
}
record->id = 0;
- prz = tmp_prz;
}
}
@@ -359,17 +345,15 @@ out:
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
struct pstore_record *record)
{
- char *hdr;
+ char hdr[36]; /* "===="(4), %lld(20), "."(1), %06lu(6), "-%c\n"(3) */
size_t len;
- hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
+ len = scnprintf(hdr, sizeof(hdr),
+ RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
(time64_t)record->time.tv_sec,
record->time.tv_nsec / 1000,
record->compressed ? 'C' : 'D');
- WARN_ON_ONCE(!hdr);
- len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
- kfree(hdr);
return len;
}
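
The header now lives on the stack: worst case is 4 bytes of "====", up to 20 for a signed 64-bit seconds value, 1 for '.', 6 for the zero-padded microseconds and 3 for "-C\n" (or "-D\n"), i.e. 34 characters plus the NUL, so hdr[36] cannot truncate. A zero-length result is treated as a failure by the write path below, which now returns -ENOMEM instead of silently writing an empty header.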
@@ -437,6 +421,9 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
/* Build header and append record contents. */
hlen = ramoops_write_kmsg_hdr(prz, record);
+ if (!hlen)
+ return -ENOMEM;
+
size = record->size;
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
@@ -611,6 +598,7 @@ static int ramoops_init_przs(const char *name,
goto fail;
}
*paddr += zone_sz;
+ prz_ar[i]->type = pstore_name_to_type(name);
}
*przs = prz_ar;
@@ -640,7 +628,7 @@ static int ramoops_init_prz(const char *name,
label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
- cxt->memtype, 0, label);
+ cxt->memtype, PRZ_FLAG_ZAP_OLD, label);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
@@ -649,9 +637,8 @@ static int ramoops_init_prz(const char *name,
return err;
}
- persistent_ram_zap(*prz);
-
*paddr += sz;
+ (*prz)->type = pstore_name_to_type(name);
return 0;
}
@@ -723,24 +710,12 @@ static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct ramoops_platform_data *pdata = dev->platform_data;
+ struct ramoops_platform_data pdata_local;
struct ramoops_context *cxt = &oops_cxt;
size_t dump_mem_sz;
phys_addr_t paddr;
int err = -EINVAL;
- if (dev_of_node(dev) && !pdata) {
- pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
- if (!pdata) {
- pr_err("cannot allocate platform data buffer\n");
- err = -ENOMEM;
- goto fail_out;
- }
-
- err = ramoops_parse_dt(pdev, pdata);
- if (err < 0)
- goto fail_out;
- }
-
/*
* Only a single ramoops area allowed at a time, so fail extra
* probes.
@@ -750,6 +725,15 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
+ if (dev_of_node(dev) && !pdata) {
+ pdata = &pdata_local;
+ memset(pdata, 0, sizeof(*pdata));
+
+ err = ramoops_parse_dt(pdev, pdata);
+ if (err < 0)
+ goto fail_out;
+ }
+
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
@@ -787,7 +771,7 @@ static int ramoops_probe(struct platform_device *pdev)
dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
- cxt->pmsg_size;
- err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
+ err = ramoops_init_przs("dmesg", dev, cxt, &cxt->dprzs, &paddr,
dump_mem_sz, cxt->record_size,
&cxt->max_dump_cnt, 0, 0);
if (err)
@@ -816,21 +800,17 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->pstore.data = cxt;
/*
- * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we
- * have to handle dumps, we must have at least record_size buffer. And
- * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be
- * ZERO_SIZE_PTR).
+ * Since bufsize is only used for dmesg crash dumps, it
+ * must match the size of the dprz record (after PRZ header
+ * and ECC bytes have been accounted for).
*/
- if (cxt->console_size)
- cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */
- cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize);
- cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
+ cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
+ cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
if (!cxt->pstore.buf) {
- pr_err("cannot allocate pstore buffer\n");
+ pr_err("cannot allocate pstore crash dump buffer\n");
err = -ENOMEM;
goto fail_clear;
}
- spin_lock_init(&cxt->pstore.buf_lock);
cxt->pstore.flags = PSTORE_FLAGS_DMESG;
if (cxt->console_size)
@@ -858,9 +838,9 @@ static int ramoops_probe(struct platform_device *pdev)
ramoops_pmsg_size = pdata->pmsg_size;
ramoops_ftrace_size = pdata->ftrace_size;
- pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
+ pr_info("using 0x%lx@0x%llx, ecc: %d\n",
cxt->size, (unsigned long long)cxt->phys_addr,
- cxt->ecc_info.ecc_size, cxt->ecc_info.block_size);
+ cxt->ecc_info.ecc_size);
return 0;
@@ -912,13 +892,12 @@ static inline void ramoops_unregister_dummy(void)
{
platform_device_unregister(dummy);
dummy = NULL;
-
- kfree(dummy_data);
- dummy_data = NULL;
}
static void __init ramoops_register_dummy(void)
{
+ struct ramoops_platform_data pdata;
+
/*
* Prepare a dummy platform data structure to carry the module
* parameters. If mem_size isn't set, then there are no module
@@ -929,30 +908,25 @@ static void __init ramoops_register_dummy(void)
pr_info("using module parameters\n");
- dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
- if (!dummy_data) {
- pr_info("could not allocate pdata\n");
- return;
- }
-
- dummy_data->mem_size = mem_size;
- dummy_data->mem_address = mem_address;
- dummy_data->mem_type = mem_type;
- dummy_data->record_size = record_size;
- dummy_data->console_size = ramoops_console_size;
- dummy_data->ftrace_size = ramoops_ftrace_size;
- dummy_data->pmsg_size = ramoops_pmsg_size;
- dummy_data->dump_oops = dump_oops;
- dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
+ memset(&pdata, 0, sizeof(pdata));
+ pdata.mem_size = mem_size;
+ pdata.mem_address = mem_address;
+ pdata.mem_type = mem_type;
+ pdata.record_size = record_size;
+ pdata.console_size = ramoops_console_size;
+ pdata.ftrace_size = ramoops_ftrace_size;
+ pdata.pmsg_size = ramoops_pmsg_size;
+ pdata.dump_oops = dump_oops;
+ pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
*/
- dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+ pdata.ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
dummy = platform_device_register_data(NULL, "ramoops", -1,
- dummy_data, sizeof(struct ramoops_platform_data));
+ &pdata, sizeof(pdata));
if (IS_ERR(dummy)) {
pr_info("could not create platform device: %ld\n",
PTR_ERR(dummy));
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 12e21f789194..f375c0735351 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -12,7 +12,7 @@
*
*/
-#define pr_fmt(fmt) "persistent_ram: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/device.h>
#include <linux/err.h>
@@ -29,6 +29,16 @@
#include <linux/vmalloc.h>
#include <asm/page.h>
+/**
+ * struct persistent_ram_buffer - persistent circular RAM buffer
+ *
+ * @sig:
+ * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value)
+ * @start:
+ *	offset into @data where the stored bytes begin
+ * @size:
+ * number of valid bytes stored in @data
+ */
struct persistent_ram_buffer {
uint32_t sig;
atomic_t start;
@@ -347,7 +357,7 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz,
int rem, ret = 0, c = count;
size_t start;
- if (unlikely(!access_ok(VERIFY_READ, s, count)))
+ if (unlikely(!access_ok(s, count)))
return -EFAULT;
if (unlikely(c > prz->buffer_size)) {
s += c - prz->buffer_size;
@@ -443,7 +453,8 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size,
void *va;
if (!request_mem_region(start, size, label ?: "ramoops")) {
- pr_err("request mem region (0x%llx@0x%llx) failed\n",
+ pr_err("request mem region (%s 0x%llx@0x%llx) failed\n",
+ label ?: "ramoops",
(unsigned long long)size, (unsigned long long)start);
return NULL;
}
@@ -489,32 +500,42 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
struct persistent_ram_ecc_info *ecc_info)
{
int ret;
+ bool zap = !!(prz->flags & PRZ_FLAG_ZAP_OLD);
ret = persistent_ram_init_ecc(prz, ecc_info);
- if (ret)
+ if (ret) {
+ pr_warn("ECC failed %s\n", prz->label);
return ret;
+ }
sig ^= PERSISTENT_RAM_SIG;
if (prz->buffer->sig == sig) {
+ if (buffer_size(prz) == 0) {
+ pr_debug("found existing empty buffer\n");
+ return 0;
+ }
+
if (buffer_size(prz) > prz->buffer_size ||
- buffer_start(prz) > buffer_size(prz))
+ buffer_start(prz) > buffer_size(prz)) {
pr_info("found existing invalid buffer, size %zu, start %zu\n",
buffer_size(prz), buffer_start(prz));
- else {
+ zap = true;
+ } else {
pr_debug("found existing buffer, size %zu, start %zu\n",
buffer_size(prz), buffer_start(prz));
persistent_ram_save_old(prz);
- return 0;
}
} else {
pr_debug("no valid data in buffer (sig = 0x%08x)\n",
prz->buffer->sig);
+ prz->buffer->sig = sig;
+ zap = true;
}
- /* Rewind missing or invalid memory area. */
- prz->buffer->sig = sig;
- persistent_ram_zap(prz);
+ /* Reset missing, invalid, or single-use memory area. */
+ if (zap)
+ persistent_ram_zap(prz);
return 0;
}
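The rewritten function above turns the unconditional zap into a policy decision: callers that want old records to survive init clear PRZ_FLAG_ZAP_OLD, and only broken or brand-new areas are always erased. A condensed sketch of the new decision (comments, not the literal code):

	/*
	 * zap = prz->flags & PRZ_FLAG_ZAP_OLD;          caller's default
	 *
	 * signature matches:
	 *     buffer empty            -> keep as-is, no zap
	 *     start/size out of range -> zap = true (corrupt header)
	 *     otherwise               -> save old data, zap only if flagged
	 * signature differs:
	 *     write new signature, zap = true (fresh memory area)
	 *
	 * if (zap) persistent_ram_zap(prz);
	 */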
@@ -572,6 +593,12 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
if (ret)
goto err;
+ pr_debug("attached %s 0x%zx@0x%llx: %zu header, %zu data, %zu ecc (%d/%d)\n",
+ prz->label, prz->size, (unsigned long long)prz->paddr,
+ sizeof(*prz->buffer), prz->buffer_size,
+ prz->size - sizeof(*prz->buffer) - prz->buffer_size,
+ prz->ecc_info.ecc_size, prz->ecc_info.block_size);
+
return prz;
err:
persistent_ram_free(prz);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index f0cbf58ad4da..fd5dd806f1b9 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -791,7 +791,8 @@ static int quotactl_cmd_write(int cmd)
/* Return true if quotactl command is manipulating quota on/off state */
static bool quotactl_cmd_onoff(int cmd)
{
- return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF);
+ return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) ||
+ (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF);
}
/*
diff --git a/fs/read_write.c b/fs/read_write.c
index 603794b207eb..30df848b7451 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -426,7 +426,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
ssize_t result;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
result = vfs_read(file, (void __user *)buf, count, pos);
set_fs(old_fs);
@@ -442,7 +442,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
- if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
+ if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
@@ -499,7 +499,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
return -EINVAL;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
@@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_write(file, (__force const char __user *)buf, count, pos);
set_fs(old_fs);
@@ -538,7 +538,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
- if (unlikely(!access_ok(VERIFY_READ, buf, count)))
+ if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
@@ -718,9 +718,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return ret;
}
-/* A write operation does a read from user space and vice versa */
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
-
/**
* rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
* into the kernel and check that it is valid.
@@ -810,7 +807,7 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
goto out;
}
if (type >= 0
- && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+ && unlikely(!access_ok(buf, len))) {
ret = -EFAULT;
goto out;
}
@@ -856,7 +853,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
*ret_pointer = iov;
ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+ if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
goto out;
/*
@@ -881,7 +878,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
if (len < 0) /* size_t not fitting in compat_ssize_t .. */
goto out;
if (type >= 0 &&
- !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ !access_ok(compat_ptr(buf), len)) {
ret = -EFAULT;
goto out;
}
@@ -1407,7 +1404,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
goto fput_in;
if (!(out.file->f_mode & FMODE_WRITE))
goto fput_out;
- retval = -EINVAL;
in_inode = file_inode(in.file);
out_inode = file_inode(out.file);
out_pos = out.file->f_pos;
@@ -1588,11 +1584,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS).
*/
- if (file_in->f_op->clone_file_range) {
- ret = file_in->f_op->clone_file_range(file_in, pos_in,
- file_out, pos_out, len);
- if (ret == 0) {
- ret = len;
+ if (file_in->f_op->remap_file_range) {
+ loff_t cloned;
+
+ cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+ file_out, pos_out,
+ min_t(loff_t, MAX_RW_COUNT, len),
+ REMAP_FILE_CAN_SHORTEN);
+ if (cloned > 0) {
+ ret = cloned;
goto done;
}
}
@@ -1686,11 +1686,12 @@ out2:
return ret;
}
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
+ bool write)
{
struct inode *inode = file_inode(file);
- if (unlikely(pos < 0))
+ if (unlikely(pos < 0 || len < 0))
return -EINVAL;
if (unlikely((loff_t) (pos + len) < 0))
@@ -1708,22 +1709,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
+/*
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else. Assume that the offsets have already been checked for block
+ * alignment.
+ *
+ * For deduplication we always scale down to the previous block because we
+ * can't meaningfully compare post-EOF contents.
+ *
+ * For clone we only link a partial EOF block above the destination file's EOF.
+ *
+ * Shorten the request if possible.
+ */
+static int generic_remap_check_len(struct inode *inode_in,
+ struct inode *inode_out,
+ loff_t pos_out,
+ loff_t *len,
+ unsigned int remap_flags)
+{
+ u64 blkmask = i_blocksize(inode_in) - 1;
+ loff_t new_len = *len;
+
+ if ((*len & blkmask) == 0)
+ return 0;
+
+ if ((remap_flags & REMAP_FILE_DEDUP) ||
+ pos_out + *len < i_size_read(inode_out))
+ new_len &= ~blkmask;
+
+ if (new_len == *len)
+ return 0;
+
+ if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+ *len = new_len;
+ return 0;
+ }
+
+ return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+}
+
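To make the rounding above concrete: with a 4 KiB block size, blkmask is 0xfff, so an unaligned tail is trimmed back to the previous block boundary. A worked sketch for the dedupe case:

	/* Assuming i_blocksize(inode_in) == 4096: */
	u64 blkmask = 4096 - 1;			/* 0xfff */
	loff_t len = 10000;			/* not block aligned */
	loff_t new_len = len & ~blkmask;	/* 8192 */

	/* With REMAP_FILE_CAN_SHORTEN the request proceeds with 8192
	 * bytes; without it, dedupe fails with -EBADE and clone with
	 * -EINVAL, exactly as the return statement above encodes. */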
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+ struct page *page;
+
+ page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0)
+ goto out_error;
+
+ src_page = vfs_dedupe_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = vfs_dedupe_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
/*
* Check that the two inodes are eligible for cloning, the ranges make
* sense, and then flush all dirty data. Caller must ensure that the
* inodes have been locked against any other modifications.
*
- * Returns: 0 for "nothing to clone", 1 for "something to clone", or
- * the usual negative error code.
+ * If there's an error, then the usual negative error code is returned.
+ * Otherwise returns 0 with *len set to the request length.
*/
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
- struct inode *inode_out, loff_t pos_out,
- u64 *len, bool is_dedupe)
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
{
- loff_t bs = inode_out->i_sb->s_blocksize;
- loff_t blen;
- loff_t isize;
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
bool same_inode = (inode_in == inode_out);
int ret;
@@ -1740,50 +1869,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
- /* Are we going all the way to the end? */
- isize = i_size_read(inode_in);
- if (isize == 0)
- return 0;
-
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
- if (is_dedupe || pos_in == isize)
+ loff_t isize = i_size_read(inode_in);
+
+ if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
return 0;
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
+ if (*len == 0)
+ return 0;
}
- /* Ensure offsets don't wrap and the input is inside i_size */
- if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
- pos_in + *len > isize)
- return -EINVAL;
-
- /* Don't allow dedupe past EOF in the dest file */
- if (is_dedupe) {
- loff_t disize;
-
- disize = i_size_read(inode_out);
- if (pos_out >= disize || pos_out + *len > disize)
- return -EINVAL;
- }
-
- /* If we're linking to EOF, continue to the block boundary. */
- if (pos_in + *len == isize)
- blen = ALIGN(isize, bs) - pos_in;
- else
- blen = *len;
-
- /* Only reflink if we're aligned to block boundaries */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
- !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
- return -EINVAL;
-
- /* Don't allow overlapped reflink within the same file */
- if (same_inode) {
- if (pos_out + blen > pos_in && pos_out < pos_in + blen)
- return -EINVAL;
- }
+ /* Check that we don't violate system file offset limits. */
+ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+ remap_flags);
+ if (ret)
+ return ret;
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
@@ -1803,7 +1906,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
/*
* Check that the extents are the same.
*/
- if (is_dedupe) {
+ if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
@@ -1814,16 +1917,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
return -EBADE;
}
- return 1;
+ ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+ remap_flags);
+ if (ret)
+ return ret;
+
+ /* If we can't alter the file contents, we're done. */
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ /* Update the timestamps, since we can alter file contents. */
+ if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+ ret = file_update_time(file_out);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Clear the security bits if the process is not being run by
+ * root. This keeps people from modifying setuid and setgid
+ * binaries.
+ */
+ ret = file_remove_privs(file_out);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+EXPORT_SYMBOL(generic_remap_file_range_prep);
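With this refactor a filesystem's ->remap_file_range implementation is expected to call the prep helper before doing its block-level work and to return the number of bytes remapped. A hedged sketch of the calling convention (the myfs_ names are invented for illustration):

	static loff_t myfs_remap_file_range(struct file *file_in, loff_t pos_in,
					    struct file *file_out, loff_t pos_out,
					    loff_t len, unsigned int remap_flags)
	{
		int ret;

		if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
			return -EINVAL;

		/* Validates ranges, waits for pending I/O, compares the
		 * extents for dedupe, and may shorten len when
		 * REMAP_FILE_CAN_SHORTEN is set. */
		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
						    pos_out, &len, remap_flags);
		if (ret < 0 || len == 0)
			return ret;

		/* ... filesystem-specific extent sharing goes here ... */

		return len;	/* bytes remapped, not just 0 on success */
	}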
-int do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
- int ret;
+ loff_t ret;
+
+ WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
@@ -1843,155 +1973,76 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in,
(file_out->f_flags & O_APPEND))
return -EBADF;
- if (!file_in->f_op->clone_file_range)
+ if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;
- ret = clone_verify_area(file_in, pos_in, len, false);
+ ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
- ret = clone_verify_area(file_out, pos_out, len, true);
+ ret = remap_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
- if (pos_in + len > i_size_read(inode_in))
- return -EINVAL;
-
- ret = file_in->f_op->clone_file_range(file_in, pos_in,
- file_out, pos_out, len);
- if (!ret) {
- fsnotify_access(file_in);
- fsnotify_modify(file_out);
- }
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
+ file_out, pos_out, len, remap_flags);
+ if (ret < 0)
+ return ret;
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
return ret;
}
EXPORT_SYMBOL(do_clone_file_range);
-int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- int ret;
+ loff_t ret;
file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+ ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+ remap_flags);
file_end_write(file_out);
return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
-/*
- * Read a page's worth of file data into the page cache. Return the page
- * locked.
- */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+/* Check whether we are allowed to dedupe the destination file */
+static bool allow_file_dedupe(struct file *file)
{
- struct address_space *mapping;
- struct page *page;
- pgoff_t n;
-
- n = offset >> PAGE_SHIFT;
- mapping = inode->i_mapping;
- page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- lock_page(page);
- return page;
+ if (capable(CAP_SYS_ADMIN))
+ return true;
+ if (file->f_mode & FMODE_WRITE)
+ return true;
+ if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
+ return true;
+ if (!inode_permission(file_inode(file), MAY_WRITE))
+ return true;
+ return false;
}
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
+loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+ struct file *dst_file, loff_t dst_pos,
+ loff_t len, unsigned int remap_flags)
{
- loff_t src_poff;
- loff_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- loff_t cmp_len;
- bool same;
- int error;
+ loff_t ret;
- error = -EINVAL;
- same = true;
- while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
- cmp_len = min(cmp_len, len);
- if (cmp_len <= 0)
- goto out_error;
-
- src_page = vfs_dedupe_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
- goto out_error;
- }
- dest_page = vfs_dedupe_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- unlock_page(src_page);
- put_page(src_page);
- goto out_error;
- }
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
-
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- same = false;
-
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
- unlock_page(dest_page);
- unlock_page(src_page);
- put_page(dest_page);
- put_page(src_page);
-
- if (!same)
- break;
-
- srcoff += cmp_len;
- destoff += cmp_len;
- len -= cmp_len;
- }
-
- *is_same = same;
- return 0;
-
-out_error:
- return error;
-}
-EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
-
-int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- struct file *dst_file, loff_t dst_pos, u64 len)
-{
- s64 ret;
+ WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+ REMAP_FILE_CAN_SHORTEN));
ret = mnt_want_write_file(dst_file);
if (ret)
return ret;
- ret = clone_verify_area(dst_file, dst_pos, len, true);
+ ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0)
goto out_drop_write;
- ret = -EINVAL;
- if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
+ ret = -EPERM;
+ if (!allow_file_dedupe(dst_file))
goto out_drop_write;
ret = -EXDEV;
@@ -2003,11 +2054,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write;
ret = -EINVAL;
- if (!dst_file->f_op->dedupe_file_range)
+ if (!dst_file->f_op->remap_file_range)
goto out_drop_write;
- ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
- dst_file, dst_pos, len);
+ if (len == 0) {
+ ret = 0;
+ goto out_drop_write;
+ }
+
+ ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
+ dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
out_drop_write:
mnt_drop_write_file(dst_file);
@@ -2024,7 +2080,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
int i;
int ret;
u16 count = same->dest_count;
- int deduped;
+ loff_t deduped;
if (!(file->f_mode & FMODE_READ))
return -EINVAL;
@@ -2035,17 +2091,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
off = same->src_offset;
len = same->src_length;
- ret = -EISDIR;
if (S_ISDIR(src->i_mode))
- goto out;
+ return -EISDIR;
- ret = -EINVAL;
if (!S_ISREG(src->i_mode))
- goto out;
+ return -EINVAL;
- ret = clone_verify_area(file, off, len, false);
+ if (!file->f_op->remap_file_range)
+ return -EOPNOTSUPP;
+
+ ret = remap_verify_area(file, off, len, false);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
if (off + len > i_size_read(src))
@@ -2075,7 +2132,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
- info->dest_offset, len);
+ info->dest_offset, len,
+ REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
@@ -2087,10 +2145,8 @@ next_fdput:
fdput(dst_fd);
next_loop:
if (fatal_signal_pending(current))
- goto out;
+ break;
}
-
-out:
return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range);
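vfs_dedupe_file_range() is what backs the FIDEDUPERANGE ioctl, so the REMAP_FILE_CAN_SHORTEN flag added above means userspace may see fewer bytes_deduped than requested. A hedged userspace sketch of driving the ioctl (file descriptors and offsets are illustrative):

	#include <linux/fs.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>

	/* Dedupe one source range into one destination range. */
	static int dedupe_one(int src_fd, __u64 src_off,
			      int dst_fd, __u64 dst_off, __u64 len)
	{
		struct file_dedupe_range *arg;
		int ret;

		arg = calloc(1, sizeof(*arg) +
				sizeof(struct file_dedupe_range_info));
		if (!arg)
			return -1;

		arg->src_offset = src_off;
		arg->src_length = len;
		arg->dest_count = 1;
		arg->info[0].dest_fd = dst_fd;
		arg->info[0].dest_offset = dst_off;

		ret = ioctl(src_fd, FIDEDUPERANGE, arg);
		if (ret == 0)
			/* status is FILE_DEDUPE_RANGE_SAME (0),
			 * FILE_DEDUPE_RANGE_DIFFERS, or a negative errno;
			 * bytes_deduped says how much was actually shared. */
			ret = arg->info[0].status;
		free(arg);
		return ret;
	}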
diff --git a/fs/readdir.c b/fs/readdir.c
index d97f548e6323..2f6a4534e0df 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -105,7 +105,7 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
}
buf->result++;
dirent = buf->dirent;
- if (!access_ok(VERIFY_WRITE, dirent,
+ if (!access_ok(dirent,
(unsigned long)(dirent->d_name + namlen + 1) -
(unsigned long)dirent))
goto efault;
@@ -221,7 +221,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- if (!access_ok(VERIFY_WRITE, dirent, count))
+ if (!access_ok(dirent, count))
return -EFAULT;
f = fdget_pos(fd);
@@ -304,7 +304,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
};
int error;
- if (!access_ok(VERIFY_WRITE, dirent, count))
+ if (!access_ok(dirent, count))
return -EFAULT;
f = fdget_pos(fd);
@@ -365,7 +365,7 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
}
buf->result++;
dirent = buf->dirent;
- if (!access_ok(VERIFY_WRITE, dirent,
+ if (!access_ok(dirent,
(unsigned long)(dirent->d_name + namlen + 1) -
(unsigned long)dirent))
goto efault;
@@ -475,7 +475,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- if (!access_ok(VERIFY_WRITE, dirent, count))
+ if (!access_ok(dirent, count))
return -EFAULT;
f = fdget_pos(fd);
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index a39a562c1c10..bd29c58ccbd8 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -26,14 +26,5 @@ ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
reiserfs-objs += xattr_acl.o
endif
-# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline
-# functions are used. This causes the compiler to advance the stack
-# pointer out of the available stack space, corrupting kernel space,
-# and causing a panic. Since this behavior only affects ppc32, this ifeq
-# will work around it. If any other architecture displays this behavior,
-# add it here.
-ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
-
TAGS:
etags *.c
-
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 48cdfc81fe10..32d8986c26fb 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -185,6 +185,7 @@ struct reiserfs_dentry_buf {
struct dir_context ctx;
struct dentry *xadir;
int count;
+ int err;
struct dentry *dentries[8];
};
@@ -207,6 +208,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
+ dbuf->err = PTR_ERR(dentry);
return PTR_ERR(dentry);
} else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
@@ -215,6 +217,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
"not found for file %pd.\n",
dentry, dbuf->xadir);
dput(dentry);
+ dbuf->err = -EIO;
return -EIO;
}
@@ -262,6 +265,10 @@ static int reiserfs_for_each_xattr(struct inode *inode,
err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
if (err)
break;
+ if (buf.err) {
+ err = buf.err;
+ break;
+ }
if (!buf.count)
break;
for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
diff --git a/fs/select.c b/fs/select.c
index 22b3bf89f051..6cbc9ff56ba0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -287,12 +287,18 @@ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
return 0;
}
+enum poll_time_type {
+ PT_TIMEVAL = 0,
+ PT_OLD_TIMEVAL = 1,
+ PT_TIMESPEC = 2,
+ PT_OLD_TIMESPEC = 3,
+};
+
static int poll_select_copy_remaining(struct timespec64 *end_time,
void __user *p,
- int timeval, int ret)
+ enum poll_time_type pt_type, int ret)
{
struct timespec64 rts;
- struct timeval rtv;
if (!p)
return ret;
@@ -310,18 +316,40 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
rts.tv_sec = rts.tv_nsec = 0;
- if (timeval) {
- if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
- memset(&rtv, 0, sizeof(rtv));
- rtv.tv_sec = rts.tv_sec;
- rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ switch (pt_type) {
+ case PT_TIMEVAL:
+ {
+ struct timeval rtv;
- if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+ memset(&rtv, 0, sizeof(rtv));
+ rtv.tv_sec = rts.tv_sec;
+ rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ return ret;
+ }
+ break;
+ case PT_OLD_TIMEVAL:
+ {
+ struct old_timeval32 rtv;
+
+ rtv.tv_sec = rts.tv_sec;
+ rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+ if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ return ret;
+ }
+ break;
+ case PT_TIMESPEC:
+ if (!put_timespec64(&rts, p))
return ret;
-
- } else if (!put_timespec64(&rts, p))
- return ret;
-
+ break;
+ case PT_OLD_TIMESPEC:
+ if (!put_old_timespec32(&rts, p))
+ return ret;
+ break;
+ default:
+ BUG();
+ }
/*
* If an application puts its timeval in read-only memory, we
* don't want the Linux-specific update to the timeval to
@@ -353,9 +381,6 @@ typedef struct {
#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long))
/*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
* Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
*/
static inline
@@ -689,7 +714,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
}
ret = core_sys_select(n, inp, outp, exp, to);
- ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
+ ret = poll_select_copy_remaining(&end_time, tvp, PT_TIMEVAL, ret);
return ret;
}
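Each select/poll entry point now just names the userspace timeout layout it consumed, and poll_select_copy_remaining() writes the remainder back in that same layout. A condensed map of callers to pt_type (a sketch assembled from the hunks in this file):

	/*
	 * select(), kern_select()           -> PT_TIMEVAL      struct timeval
	 * compat select(), old_select()     -> PT_OLD_TIMEVAL  struct old_timeval32
	 * pselect6(), ppoll(), *_time64     -> PT_TIMESPEC     struct __kernel_timespec
	 * pselect6_time32(), ppoll_time32() -> PT_OLD_TIMESPEC struct old_timespec32
	 */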
@@ -701,49 +726,41 @@ SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
}
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec __user *tsp,
- const sigset_t __user *sigmask, size_t sigsetsize)
+ fd_set __user *exp, void __user *tsp,
+ const sigset_t __user *sigmask, size_t sigsetsize,
+ enum poll_time_type type)
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
if (tsp) {
- if (get_timespec64(&ts, tsp))
- return -EFAULT;
+ switch (type) {
+ case PT_TIMESPEC:
+ if (get_timespec64(&ts, tsp))
+ return -EFAULT;
+ break;
+ case PT_OLD_TIMESPEC:
+ if (get_old_timespec32(&ts, tsp))
+ return -EFAULT;
+ break;
+ default:
+ BUG();
+ }
to = &end_time;
if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
- if (sigmask) {
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
- if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
- return -EFAULT;
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (ret)
+ return ret;
ret = core_sys_select(n, inp, outp, exp, to);
- ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
+ ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- if (ret == -ERESTARTNOHAND) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ restore_user_sigmask(sigmask, &sigsaved);
return ret;
}
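set_user_sigmask()/restore_user_sigmask() replace the sigprocmask dance that every pselect/ppoll variant used to open-code. A sketch of the helpers' contract (not their exact bodies):

	sigset_t ksigmask, sigsaved;
	int ret;

	/* Copies the user mask in, rejects a bad sigsetsize, strips
	 * SIGKILL/SIGSTOP, installs it, and saves the old mask; a NULL
	 * user mask makes the whole call a no-op. */
	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
	if (ret)
		return ret;

	ret = do_sys_poll(ufds, nfds, to);

	/* Restores the saved mask; if a signal arrived meanwhile it is
	 * deferred via set_restore_sigmask() so the signal is delivered
	 * under the temporary mask on the way back to userspace. */
	restore_user_sigmask(sigmask, &sigsaved);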
@@ -755,23 +772,45 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
* the sigset size.
*/
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
- fd_set __user *, exp, struct timespec __user *, tsp,
+ fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
void __user *, sig)
{
size_t sigsetsize = 0;
sigset_t __user *up = NULL;
if (sig) {
- if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+ if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
|| __get_user(up, (sigset_t __user * __user *)sig)
|| __get_user(sigsetsize,
(size_t __user *)(sig+sizeof(void *))))
return -EFAULT;
}
- return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
+ return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_TIMESPEC);
}
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp,
+ fd_set __user *, exp, struct old_timespec32 __user *, tsp,
+ void __user *, sig)
+{
+ size_t sigsetsize = 0;
+ sigset_t __user *up = NULL;
+
+ if (sig) {
+ if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
+ || __get_user(up, (sigset_t __user * __user *)sig)
+ || __get_user(sigsetsize,
+ (size_t __user *)(sig+sizeof(void *))))
+ return -EFAULT;
+ }
+
+ return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_OLD_TIMESPEC);
+}
+
+#endif
+
#ifdef __ARCH_WANT_SYS_OLD_SELECT
struct sel_arg_struct {
unsigned long n;
@@ -1045,7 +1084,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
}
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
- struct timespec __user *, tsp, const sigset_t __user *, sigmask,
+ struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
sigset_t ksigmask, sigsaved;
@@ -1061,89 +1100,62 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- if (sigmask) {
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
- if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
- return -EFAULT;
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (ret)
+ return ret;
ret = do_sys_poll(ufds, nfds, to);
+ restore_user_sigmask(sigmask, &sigsaved);
+
/* We can restart this syscall, usually */
- if (ret == -EINTR) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
+ if (ret == -EINTR)
ret = -ERESTARTNOHAND;
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
+ ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
}
-#ifdef CONFIG_COMPAT
-#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
-static
-int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p,
- int timeval, int ret)
+SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
+ struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
+ size_t, sigsetsize)
{
- struct timespec64 ts;
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts, end_time, *to = NULL;
+ int ret;
- if (!p)
- return ret;
+ if (tsp) {
+ if (get_old_timespec32(&ts, tsp))
+ return -EFAULT;
- if (current->personality & STICKY_TIMEOUTS)
- goto sticky;
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
- /* No update for zero timeout */
- if (!end_time->tv_sec && !end_time->tv_nsec)
+ ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (ret)
return ret;
- ktime_get_ts64(&ts);
- ts = timespec64_sub(*end_time, ts);
- if (ts.tv_sec < 0)
- ts.tv_sec = ts.tv_nsec = 0;
+ ret = do_sys_poll(ufds, nfds, to);
- if (timeval) {
- struct old_timeval32 rtv;
+ restore_user_sigmask(sigmask, &sigsaved);
- rtv.tv_sec = ts.tv_sec;
- rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+ /* We can restart this syscall, usually */
+ if (ret == -EINTR)
+ ret = -ERESTARTNOHAND;
- if (!copy_to_user(p, &rtv, sizeof(rtv)))
- return ret;
- } else {
- if (!put_old_timespec32(&ts, p))
- return ret;
- }
- /*
- * If an application puts its timeval in read-only memory, we
- * don't want the Linux-specific update to the timeval to
- * cause a fault after the select has completed
- * successfully. However, because we're not updating the
- * timeval, we can't restart the system call.
- */
+ ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
-sticky:
- if (ret == -ERESTARTNOHAND)
- ret = -EINTR;
return ret;
}
+#endif
+
+#ifdef CONFIG_COMPAT
+#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
/*
* Ooo, nasty. We need here to frob 32-bit unsigned longs to
@@ -1275,7 +1287,7 @@ static int do_compat_select(int n, compat_ulong_t __user *inp,
}
ret = compat_core_sys_select(n, inp, outp, exp, to);
- ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
+ ret = poll_select_copy_remaining(&end_time, tvp, PT_OLD_TIMEVAL, ret);
return ret;
}
@@ -1307,53 +1319,67 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct old_timespec32 __user *tsp, compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
+ void __user *tsp, compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize, enum poll_time_type type)
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
if (tsp) {
- if (get_old_timespec32(&ts, tsp))
- return -EFAULT;
+ switch (type) {
+ case PT_OLD_TIMESPEC:
+ if (get_old_timespec32(&ts, tsp))
+ return -EFAULT;
+ break;
+ case PT_TIMESPEC:
+ if (get_timespec64(&ts, tsp))
+ return -EFAULT;
+ break;
+ default:
+ BUG();
+ }
to = &end_time;
if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (get_compat_sigset(&ksigmask, sigmask))
- return -EFAULT;
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (ret)
+ return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
- ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+ ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- if (ret == -ERESTARTNOHAND) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ restore_user_sigmask(sigmask, &sigsaved);
return ret;
}
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct __kernel_timespec __user *, tsp, void __user *, sig)
+{
+ compat_size_t sigsetsize = 0;
+ compat_uptr_t up = 0;
+
+ if (sig) {
+ if (!access_ok(sig,
+ sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+ __get_user(up, (compat_uptr_t __user *)sig) ||
+ __get_user(sigsetsize,
+ (compat_size_t __user *)(sig+sizeof(up))))
+ return -EFAULT;
+ }
+
+ return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+ sigsetsize, PT_TIMESPEC);
+}
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
+COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct old_timespec32 __user *, tsp, void __user *, sig)
{
@@ -1361,18 +1387,22 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
compat_uptr_t up = 0;
if (sig) {
- if (!access_ok(VERIFY_READ, sig,
+ if (!access_ok(sig,
sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
__get_user(up, (compat_uptr_t __user *)sig) ||
__get_user(sigsetsize,
(compat_size_t __user *)(sig+sizeof(up))))
return -EFAULT;
}
+
return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
- sigsetsize);
+ sigsetsize, PT_OLD_TIMESPEC);
}
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+#endif
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
@@ -1389,36 +1419,57 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
return -EINVAL;
}
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (get_compat_sigset(&ksigmask, sigmask))
+ ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (ret)
+ return ret;
+
+ ret = do_sys_poll(ufds, nfds, to);
+
+ restore_user_sigmask(sigmask, &sigsaved);
+
+ /* We can restart this syscall, usually */
+ if (ret == -EINTR)
+ ret = -ERESTARTNOHAND;
+
+ ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
+
+ return ret;
+}
+#endif
+
+/* New compat syscall for 64-bit time_t */
+COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
+ unsigned int, nfds, struct __kernel_timespec __user *, tsp,
+ const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
+{
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts, end_time, *to = NULL;
+ int ret;
+
+ if (tsp) {
+ if (get_timespec64(&ts, tsp))
return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
}
+ ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ if (ret)
+ return ret;
+
ret = do_sys_poll(ufds, nfds, to);
+ restore_user_sigmask(sigmask, &sigsaved);
+
/* We can restart this syscall, usually */
- if (ret == -EINTR) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
+ if (ret == -EINTR)
ret = -ERESTARTNOHAND;
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+ ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
}
+
#endif
diff --git a/fs/splice.c b/fs/splice.c
index b3daa971f597..6489fb9436e4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -301,7 +301,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
int idx, ret;
- iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
+ iov_iter_pipe(&to, READ, pipe, len);
idx = to.idx;
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
@@ -357,7 +357,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);
@@ -386,7 +386,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
*/
offset = *ppos & ~PAGE_MASK;
- iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
+ iov_iter_pipe(&to, READ, pipe, len + offset);
res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
if (res <= 0)
@@ -745,8 +745,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
left -= this_len;
}
- iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
- sd.total_len - left);
+ iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
@@ -946,11 +945,16 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
+ WARN_ON_ONCE(pipe->nrbufs != 0);
+
while (len) {
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
- ret = do_splice_to(in, &pos, pipe, len, flags);
+ /* Don't try to read more than the pipe has space for. */
+ read_len = min_t(size_t, len,
+ (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+ ret = do_splice_to(in, &pos, pipe, read_len, flags);
if (unlikely(ret <= 0))
goto out_release;
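A worked example of the new clamp: with the default pipe of 16 buffers and 3 already occupied, at most 13 pages can be produced per iteration, so a large request is fed through the pipe in 52 KiB chunks instead of overcommitting it:

	/* pipe->buffers == 16, pipe->nrbufs == 3, PAGE_SHIFT == 12: */
	read_len = min_t(size_t, len,
			 (16 - 3) << 12);	/* 13 pages = 53248 bytes */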
@@ -1119,6 +1123,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (ipipe == opipe)
return -EINVAL;
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
return splice_pipe_to_pipe(ipipe, opipe, len, flags);
}
@@ -1144,6 +1151,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (unlikely(ret < 0))
return ret;
+ if (in->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
file_start_write(out);
ret = do_splice_from(ipipe, out, &offset, len, flags);
file_end_write(out);
@@ -1168,6 +1178,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
offset = in->f_pos;
}
+ if (out->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
@@ -1713,6 +1726,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
* copying the data.
*/
if (ipipe && opipe && ipipe != opipe) {
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
/*
* Keep going, unless we encounter an error. The ipipe/opipe
* ordering doesn't really matter.
diff --git a/fs/statfs.c b/fs/statfs.c
index f0216629621d..eea7af6f2f22 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
return retval;
}
+int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+{
+ struct kstatfs st;
+ int error;
+
+ error = statfs_by_dentry(dentry, &st);
+ if (error)
+ return error;
+
+ *fsid = st.f_fsid;
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_fsid);
+
int vfs_statfs(const struct path *path, struct kstatfs *buf)
{
int error;
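A hedged usage sketch for the new helper (the pr_debug consumer is invented; in this series the expected user appears to be fsid reporting for fanotify):

	__kernel_fsid_t fsid;
	int err;

	/* Runs ->statfs on the dentry's superblock and copies out f_fsid. */
	err = vfs_get_fsid(dentry, &fsid);
	if (err)
		return err;

	pr_debug("fsid: %x.%x\n", fsid.val[0], fsid.val[1]);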
diff --git a/fs/super.c b/fs/super.c
index f3a8c008e164..48e25eba8465 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
+#include <uapi/linux/mount.h>
#include "internal.h"
static int thaw_super_locked(struct super_block *sb);
@@ -442,7 +443,7 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
- fsnotify_unmount_inodes(sb);
+ fsnotify_sb_delete(sb);
cgroup_writeback_umount();
evict_inodes(sb);
@@ -1245,17 +1246,13 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
struct dentry *root;
struct super_block *sb;
- char *secdata = NULL;
int error = -ENOMEM;
+ void *sec_opts = NULL;
if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
- secdata = alloc_secdata();
- if (!secdata)
- goto out;
-
- error = security_sb_copy_data(data, secdata);
+ error = security_sb_eat_lsm_opts(data, &sec_opts);
if (error)
- goto out_free_secdata;
+ return ERR_PTR(error);
}
root = type->mount(type, flags, name, data);
@@ -1276,10 +1273,16 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
smp_wmb();
sb->s_flags |= SB_BORN;
- error = security_sb_kern_mount(sb, flags, secdata);
+ error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
if (error)
goto out_sb;
+ if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) {
+ error = security_sb_kern_mount(sb);
+ if (error)
+ goto out_sb;
+ }
+
/*
* filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
* but s_maxbytes was an unsigned long long for many releases. Throw
@@ -1290,14 +1293,13 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
"negative value (%lld)\n", type->name, sb->s_maxbytes);
up_write(&sb->s_umount);
- free_secdata(secdata);
+ security_free_mnt_opts(&sec_opts);
return root;
out_sb:
dput(root);
deactivate_locked_super(sb);
out_free_secdata:
- free_secdata(secdata);
-out:
+ security_free_mnt_opts(&sec_opts);
return ERR_PTR(error);
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index feeae8081c22..aa85f2874a9f 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,7 +43,8 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
kuid_t uid;
kgid_t gid;
- BUG_ON(!kobj);
+ if (WARN_ON(!kobj))
+ return -EINVAL;
if (kobj->parent)
parent = kobj->parent->sd;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 0a7252aecfa5..130fc6fbcc03 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -17,7 +17,6 @@
#include <linux/seq_file.h>
#include "sysfs.h"
-#include "../kernfs/kernfs-internal.h"
/*
* Determine ktype->sysfs_ops for the given kernfs_node. This function
@@ -325,7 +324,8 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
kuid_t uid;
kgid_t gid;
- BUG_ON(!kobj || !kobj->sd || !attr);
+ if (WARN_ON(!kobj || !kobj->sd || !attr))
+ return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
@@ -334,7 +334,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
-int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr)
{
int err = 0;
int i;
@@ -493,9 +493,10 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
return ret;
}
-void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
+void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr)
{
int i;
+
for (i = 0; ptr[i]; i++)
sysfs_remove_file(kobj, ptr[i]);
}
@@ -537,7 +538,8 @@ int sysfs_create_bin_file(struct kobject *kobj,
kuid_t uid;
kgid_t gid;
- BUG_ON(!kobj || !kobj->sd || !attr);
+ if (WARN_ON(!kobj || !kobj->sd || !attr))
+ return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1eb2d6307663..57038604d4a8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -112,7 +112,8 @@ static int internal_create_group(struct kobject *kobj, int update,
kgid_t gid;
int error;
- BUG_ON(!kobj || (!update && !kobj->sd));
+ if (WARN_ON(!kobj || (!update && !kobj->sd)))
+ return -EINVAL;
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 215c225b2ca1..c4deecc80f67 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -23,7 +23,8 @@ static int sysfs_do_create_link_sd(struct kernfs_node *parent,
{
struct kernfs_node *kn, *target = NULL;
- BUG_ON(!name || !parent);
+ if (WARN_ON(!name || !parent))
+ return -EINVAL;
/*
* We don't own @target_kobj and it may be removed at any time.
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 499a20a5a010..273736f41be3 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -275,7 +275,7 @@ static int __sysv_write_inode(struct inode *inode, int wait)
}
}
brelse(bh);
- return 0;
+ return err;
}
int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 803ca070d42e..6a6fc8aa1de7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags,
const struct old_itimerspec32 __user *, utmr,
struct old_itimerspec32 __user *, otmr)
{
@@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
return ret;
}
-COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+SYSCALL_DEFINE2(timerfd_gettime32, int, ufd,
struct old_itimerspec32 __user *, otmr)
{
struct itimerspec64 kotmr;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index bbc78549be4c..bc1e082d921d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -7,13 +7,15 @@ config UBIFS_FS
select CRYPTO if UBIFS_FS_ZLIB
select CRYPTO_LZO if UBIFS_FS_LZO
select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
+ select CRYPTO_HASH_INFO
depends on MTD_UBI
help
UBIFS is a file system for flash devices which works on top of UBI.
+if UBIFS_FS
+
config UBIFS_FS_ADVANCED_COMPR
bool "Advanced compression options"
- depends on UBIFS_FS
help
This option allows you to explicitly choose which compressors, if any,
are enabled in UBIFS. Removing compressors means the inability to read
@@ -23,7 +25,6 @@ config UBIFS_FS_ADVANCED_COMPR
config UBIFS_FS_LZO
bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
- depends on UBIFS_FS
default y
help
LZO compressor is generally faster than zlib but compresses worse.
@@ -31,14 +32,12 @@ config UBIFS_FS_LZO
config UBIFS_FS_ZLIB
bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
- depends on UBIFS_FS
default y
help
Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
config UBIFS_ATIME_SUPPORT
- bool "Access time support" if UBIFS_FS
- depends on UBIFS_FS
+ bool "Access time support"
default n
help
Originally UBIFS did not support atime, because it looked like a bad idea due
@@ -53,7 +52,6 @@ config UBIFS_ATIME_SUPPORT
config UBIFS_FS_XATTR
bool "UBIFS XATTR support"
- depends on UBIFS_FS
default y
help
Saying Y here includes support for extended attributes (xattrs).
@@ -64,7 +62,7 @@ config UBIFS_FS_XATTR
config UBIFS_FS_ENCRYPTION
bool "UBIFS Encryption"
- depends on UBIFS_FS && UBIFS_FS_XATTR && BLOCK
+ depends on UBIFS_FS_XATTR && BLOCK
select FS_ENCRYPTION
default n
help
@@ -75,7 +73,7 @@ config UBIFS_FS_ENCRYPTION
config UBIFS_FS_SECURITY
bool "UBIFS Security Labels"
- depends on UBIFS_FS && UBIFS_FS_XATTR
+ depends on UBIFS_FS_XATTR
default y
help
Security labels provide an access control facility to support Linux
@@ -85,3 +83,16 @@ config UBIFS_FS_SECURITY
the extended attribute support in advance.
If you are not using a security module, say N.
+
+config UBIFS_FS_AUTHENTICATION
+ bool "UBIFS authentication support"
+ depends on KEYS
+ select CRYPTO_HMAC
+ help
+ Enable authentication support for UBIFS. This feature offers protection
+ against offline changes for both data and metadata of the filesystem.
+ If you say yes here, you should also select a hashing algorithm such as
+ sha256; these are not selected automatically since there are many
+ different options.
+
+endif # UBIFS_FS
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 6197d7e539e4..5f838319c8d5 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -8,3 +8,4 @@ ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
ubifs-y += misc.o
ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
+ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
new file mode 100644
index 000000000000..5bf5fd08879e
--- /dev/null
+++ b/fs/ubifs/auth.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2018 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ */
+
+/*
+ * This file implements various helper functions for UBIFS authentication support.
+ */
+
+#include <linux/crypto.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <crypto/algapi.h>
+#include <keys/user-type.h>
+
+#include "ubifs.h"
+
+/**
+ * ubifs_node_calc_hash - calculate the hash of a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to calculate a hash for
+ * @hash: the returned hash
+ *
+ * Returns 0 for success or a negative error code otherwise.
+ */
+int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node,
+ u8 *hash)
+{
+ const struct ubifs_ch *ch = node;
+ SHASH_DESC_ON_STACK(shash, c->hash_tfm);
+ int err;
+
+ shash->tfm = c->hash_tfm;
+ shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+/**
+ * ubifs_hash_calc_hmac - calculate a HMAC from a hash
+ * @c: UBIFS file-system description object
+ * @hash: the node to calculate a HMAC for
+ * @hmac: the returned HMAC
+ *
+ * Returns 0 for success or a negative error code otherwise.
+ */
+static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash,
+ u8 *hmac)
+{
+ SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
+ int err;
+
+ shash->tfm = c->hmac_tfm;
+ shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_shash_digest(shash, hash, c->hash_len, hmac);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+/**
+ * ubifs_prepare_auth_node - Prepare an authentication node
+ * @c: UBIFS file-system description object
+ * @node: the node to calculate a hash for
+ * @hash: input hash of previous nodes
+ *
+ * This function prepares an authentication node for writing onto flash.
+ * It creates a HMAC from the given input hash and writes it to the node.
+ *
+ * Returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
+ struct shash_desc *inhash)
+{
+ SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
+ struct ubifs_auth_node *auth = node;
+ u8 *hash;
+ int err;
+
+ hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
+ if (!hash)
+ return -ENOMEM;
+
+ hash_desc->tfm = c->hash_tfm;
+ hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ ubifs_shash_copy_state(c, inhash, hash_desc);
+
+ err = crypto_shash_final(hash_desc, hash);
+ if (err)
+ goto out;
+
+ err = ubifs_hash_calc_hmac(c, hash, auth->hmac);
+ if (err)
+ goto out;
+
+ auth->ch.node_type = UBIFS_AUTH_NODE;
+ ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0);
+
+ err = 0;
+out:
+ kfree(hash);
+
+ return err;
+}
+
+static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c,
+ struct crypto_shash *tfm)
+{
+ struct shash_desc *desc;
+ int err;
+
+ if (!ubifs_authenticated(c))
+ return NULL;
+
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
+ if (!desc)
+ return ERR_PTR(-ENOMEM);
+
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_shash_init(desc);
+ if (err) {
+ kfree(desc);
+ return ERR_PTR(err);
+ }
+
+ return desc;
+}
+
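This file mixes the one-shot crypto_shash_digest() with the incremental flow that the descriptor returned above is meant for. For reference, the incremental pattern looks like this (a generic sketch of the kernel shash API of this era, not UBIFS code):

	#include <crypto/hash.h>

	static int example_hash_two(struct crypto_shash *tfm,
				    const void *a, size_t alen,
				    const void *b, size_t blen, u8 *out)
	{
		SHASH_DESC_ON_STACK(desc, tfm);
		int err;

		desc->tfm = tfm;
		desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;

		err = crypto_shash_init(desc);
		if (!err)
			err = crypto_shash_update(desc, a, alen);
		if (!err)
			err = crypto_shash_update(desc, b, blen);
		if (!err)
			err = crypto_shash_final(desc, out);
		return err;
	}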
+/**
+ * __ubifs_hash_get_desc - get a descriptor suitable for hashing a node
+ * @c: UBIFS file-system description object
+ *
+ * This function returns a descriptor suitable for hashing a node. Free it
+ * after use with kfree().
+ */
+struct shash_desc *__ubifs_hash_get_desc(const struct ubifs_info *c)
+{
+ return ubifs_get_desc(c, c->hash_tfm);
+}
+
+/**
+ * __ubifs_shash_final - finalize shash
+ * @c: UBIFS file-system description object
+ * @desc: the descriptor
+ * @out: the output hash
+ *
+ * Simple wrapper around crypto_shash_final(), safe to be called with
+ * disabled authentication.
+ */
+int __ubifs_shash_final(const struct ubifs_info *c, struct shash_desc *desc,
+ u8 *out)
+{
+ if (ubifs_authenticated(c))
+ return crypto_shash_final(desc, out);
+
+ return 0;
+}
+
+/**
+ * ubifs_bad_hash - Report hash mismatches
+ * @c: UBIFS file-system description object
+ * @node: the node
+ * @hash: the expected hash
+ * @lnum: the LEB @node was read from
+ * @offs: offset in LEB @node was read from
+ *
+ * This function reports a hash mismatch when a node has a different hash than
+ * expected.
+ */
+void ubifs_bad_hash(const struct ubifs_info *c, const void *node, const u8 *hash,
+ int lnum, int offs)
+{
+ int len = min(c->hash_len, 20);
+ int cropped = len != c->hash_len;
+ const char *cont = cropped ? "..." : "";
+
+ u8 calc[UBIFS_HASH_ARR_SZ];
+
+ __ubifs_node_calc_hash(c, node, calc);
+
+ ubifs_err(c, "hash mismatch on node at LEB %d:%d", lnum, offs);
+ ubifs_err(c, "hash expected: %*ph%s", len, hash, cont);
+ ubifs_err(c, "hash calculated: %*ph%s", len, calc, cont);
+}
+
+/**
+ * __ubifs_node_check_hash - check the hash of a node against given hash
+ * @c: UBIFS file-system description object
+ * @node: the node
+ * @expected: the expected hash
+ *
+ * This function calculates a hash over a node and compares it to the given hash.
+ * Returns 0 if both hashes are equal or authentication is disabled; otherwise
+ * a negative error code is returned.
+ */
+int __ubifs_node_check_hash(const struct ubifs_info *c, const void *node,
+ const u8 *expected)
+{
+ u8 calc[UBIFS_HASH_ARR_SZ];
+ int err;
+
+ err = __ubifs_node_calc_hash(c, node, calc);
+ if (err)
+ return err;
+
+ if (ubifs_check_hash(c, expected, calc))
+ return -EPERM;
+
+ return 0;
+}
+
+/**
+ * ubifs_init_authentication - initialize UBIFS authentication support
+ * @c: UBIFS file-system description object
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_init_authentication(struct ubifs_info *c)
+{
+ struct key *keyring_key;
+ const struct user_key_payload *ukp;
+ int err;
+ char hmac_name[CRYPTO_MAX_ALG_NAME];
+
+ if (!c->auth_hash_name) {
+ ubifs_err(c, "authentication hash name needed with authentication");
+ return -EINVAL;
+ }
+
+ c->auth_hash_algo = match_string(hash_algo_name, HASH_ALGO__LAST,
+ c->auth_hash_name);
+ if ((int)c->auth_hash_algo < 0) {
+ ubifs_err(c, "Unknown hash algo %s specified",
+ c->auth_hash_name);
+ return -EINVAL;
+ }
+
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ c->auth_hash_name);
+
+ keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL);
+
+ if (IS_ERR(keyring_key)) {
+ ubifs_err(c, "Failed to request key: %ld",
+ PTR_ERR(keyring_key));
+ return PTR_ERR(keyring_key);
+ }
+
+ down_read(&keyring_key->sem);
+
+ if (keyring_key->type != &key_type_logon) {
+ ubifs_err(c, "key type must be logon");
+ err = -ENOKEY;
+ goto out;
+ }
+
+ ukp = user_key_payload_locked(keyring_key);
+ if (!ukp) {
+ /* key was revoked before we acquired its semaphore */
+ err = -EKEYREVOKED;
+ goto out;
+ }
+
+ c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0);
+ if (IS_ERR(c->hash_tfm)) {
+ err = PTR_ERR(c->hash_tfm);
+ ubifs_err(c, "Can not allocate %s: %d",
+ c->auth_hash_name, err);
+ goto out;
+ }
+
+ c->hash_len = crypto_shash_digestsize(c->hash_tfm);
+ if (c->hash_len > UBIFS_HASH_ARR_SZ) {
+ ubifs_err(c, "hash %s is bigger than maximum allowed hash size (%d > %d)",
+ c->auth_hash_name, c->hash_len, UBIFS_HASH_ARR_SZ);
+ err = -EINVAL;
+ goto out_free_hash;
+ }
+
+ c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+ if (IS_ERR(c->hmac_tfm)) {
+ err = PTR_ERR(c->hmac_tfm);
+ ubifs_err(c, "Can not allocate %s: %d", hmac_name, err);
+ goto out_free_hash;
+ }
+
+ c->hmac_desc_len = crypto_shash_digestsize(c->hmac_tfm);
+ if (c->hmac_desc_len > UBIFS_HMAC_ARR_SZ) {
+ ubifs_err(c, "hmac %s is bigger than maximum allowed hmac size (%d > %d)",
+ hmac_name, c->hmac_desc_len, UBIFS_HMAC_ARR_SZ);
+ err = -EINVAL;
+ goto out_free_hash;
+ }
+
+ err = crypto_shash_setkey(c->hmac_tfm, ukp->data, ukp->datalen);
+ if (err)
+ goto out_free_hmac;
+
+ c->authenticated = true;
+
+ c->log_hash = ubifs_hash_get_desc(c);
+ if (IS_ERR(c->log_hash)) {
+ err = PTR_ERR(c->log_hash);
+ goto out_free_hmac;
+ }
+
+ err = 0;
+
+out_free_hmac:
+ if (err)
+ crypto_free_shash(c->hmac_tfm);
+out_free_hash:
+ if (err)
+ crypto_free_shash(c->hash_tfm);
+out:
+ up_read(&keyring_key->sem);
+ key_put(keyring_key);
+
+ return err;
+}
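
For reference, an illustrative user-space counterpart (an assumption, not part of this patch): the HMAC key referenced by c->auth_key_name is expected to be installed on a keyring as a logon key before mounting. The mount option names in the comments are assumptions based on the auth_key_name and auth_hash_name fields used above.

    /* Sketch only: install the key the auth_key= mount option refers to. */
    #include <keyutils.h>
    #include <stdio.h>

    int main(void)
    {
            static const unsigned char secret[32] = { 0 }; /* raw HMAC key */
            key_serial_t key;

            key = add_key("logon", "ubifs:mykey", secret, sizeof(secret),
                          KEY_SPEC_SESSION_KEYRING);
            if (key < 0) {
                    perror("add_key");
                    return 1;
            }
            /* then: mount -t ubifs -o auth_key=ubifs:mykey,... */
            return 0;
    }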
+
+/**
+ * __ubifs_exit_authentication - release resource
+ * @c: UBIFS file-system description object
+ *
+ * This function releases the authentication related resources.
+ */
+void __ubifs_exit_authentication(struct ubifs_info *c)
+{
+ if (!ubifs_authenticated(c))
+ return;
+
+ crypto_free_shash(c->hmac_tfm);
+ crypto_free_shash(c->hash_tfm);
+ kfree(c->log_hash);
+}
+
+/**
+ * ubifs_node_calc_hmac - calculate the HMAC of a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to insert a HMAC into.
+ * @len: the length of the node
+ * @ofs_hmac: the offset in the node where the HMAC is inserted
+ * @hmac: returned HMAC
+ *
+ * This function calculates a HMAC of a UBIFS node. The HMAC is expected to be
+ * embedded into the node, so this area is not covered by the HMAC. Also not
+ * covered are the UBIFS_NODE_MAGIC and the CRC of the node.
+ */
+static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node,
+ int len, int ofs_hmac, void *hmac)
+{
+ SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
+ int hmac_len = c->hmac_desc_len;
+ int err;
+
+ ubifs_assert(c, ofs_hmac > 8);
+ ubifs_assert(c, ofs_hmac + hmac_len < len);
+
+ shash->tfm = c->hmac_tfm;
+ shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_shash_init(shash);
+ if (err)
+ return err;
+
+ /* area behind the common node header CRC, up to the start of the HMAC */
+ err = crypto_shash_update(shash, node + 8, ofs_hmac - 8);
+ if (err < 0)
+ return err;
+
+ /* behind HMAC, if any */
+ if (len - ofs_hmac - hmac_len > 0) {
+ err = crypto_shash_update(shash, node + ofs_hmac + hmac_len,
+ len - ofs_hmac - hmac_len);
+ if (err < 0)
+ return err;
+ }
+
+ return crypto_shash_final(shash, hmac);
+}
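
The skipped and covered regions can be pictured as follows (annotation, not patch content):

    /*
     * HMAC coverage within a node (illustration only):
     *
     *   0         8                ofs_hmac         ofs_hmac+hmac_len    len
     *   |  magic  |     hashed     |      HMAC      |       hashed       |
     *   |  + CRC  |                |    (skipped)   |                    |
     *
     * The first 8 bytes (common header magic and CRC) are excluded because
     * the CRC can only be computed after the HMAC has been inserted.
     */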
+
+/**
+ * __ubifs_node_insert_hmac - insert a HMAC into a UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to insert a HMAC into.
+ * @len: the length of the node
+ * @ofs_hmac: the offset in the node where the HMAC is inserted
+ *
+ * This function inserts a HMAC at offset @ofs_hmac into the node given in
+ * @node.
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int __ubifs_node_insert_hmac(const struct ubifs_info *c, void *node, int len,
+ int ofs_hmac)
+{
+ return ubifs_node_calc_hmac(c, node, len, ofs_hmac, node + ofs_hmac);
+}
+
+/**
+ * __ubifs_node_verify_hmac - verify the HMAC of UBIFS node
+ * @c: UBIFS file-system description object
+ * @node: the node to insert a HMAC into.
+ * @len: the length of the node
+ * @ofs_hmac: the offset in the node where the HMAC is inserted
+ *
+ * This function verifies the HMAC at offset @ofs_hmac of the node given in
+ * @node. Returns 0 if successful or a negative error code otherwise.
+ */
+int __ubifs_node_verify_hmac(const struct ubifs_info *c, const void *node,
+ int len, int ofs_hmac)
+{
+ int hmac_len = c->hmac_desc_len;
+ u8 *hmac;
+ int err;
+
+ hmac = kmalloc(hmac_len, GFP_NOFS);
+ if (!hmac)
+ return -ENOMEM;
+
+ err = ubifs_node_calc_hmac(c, node, len, ofs_hmac, hmac);
+ if (err) {
+ kfree(hmac);
+ return err;
+ }
+
+ err = crypto_memneq(hmac, node + ofs_hmac, hmac_len);
+
+ kfree(hmac);
+
+ if (!err)
+ return 0;
+
+ return -EPERM;
+}
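
crypto_memneq() rather than memcmp() keeps the comparison constant-time, so an attacker cannot learn the position of the first mismatching byte from timing. A contrived sketch of the distinction (illustration only):

    /* Illustration only: why memcmp() is unsuitable for comparing MACs. */
    static bool mac_equal_timing_unsafe(const u8 *a, const u8 *b, int len)
    {
            return memcmp(a, b, len) == 0;        /* may return early: timing leak */
    }

    static bool mac_equal_timing_safe(const u8 *a, const u8 *b, int len)
    {
            return crypto_memneq(a, b, len) == 0; /* constant time */
    }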
+
+int __ubifs_shash_copy_state(const struct ubifs_info *c, struct shash_desc *src,
+ struct shash_desc *target)
+{
+ u8 *state;
+ int err;
+
+ state = kmalloc(crypto_shash_descsize(src->tfm), GFP_NOFS);
+ if (!state)
+ return -ENOMEM;
+
+ err = crypto_shash_export(src, state);
+ if (err)
+ goto out;
+
+ err = crypto_shash_import(target, state);
+
+out:
+ kfree(state);
+
+ return err;
+}
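
A sketch of the export/import pattern this helper wraps (illustration only, assuming an already initialized source descriptor): the intermediate state is serialized out of one descriptor and replayed into a second one, so both can be updated and finalized independently.

    /* Sketch only: fork a running hash so two descriptors diverge. */
    static int fork_hash_example(const struct ubifs_info *c,
                                 struct shash_desc *src, struct shash_desc *dst,
                                 const void *more, unsigned int len, u8 *out)
    {
            int err;

            err = __ubifs_shash_copy_state(c, src, dst);
            if (err)
                    return err;

            err = crypto_shash_update(dst, more, len); /* only dst sees this */
            if (err)
                    return err;

            return crypto_shash_final(dst, out);       /* src keeps running */
    }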
+
+/**
+ * ubifs_hmac_wkm - Create a HMAC of the well known message
+ * @c: UBIFS file-system description object
+ * @hmac: The HMAC of the well known message
+ *
+ * This function creates a HMAC of a well known message. This is used
+ * to check if the provided key is suitable to authenticate a UBIFS
+ * image. This is only a convenience for the user, providing a better
+ * error message when the wrong key is given.
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac)
+{
+ SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
+ int err;
+ const char well_known_message[] = "UBIFS";
+
+ if (!ubifs_authenticated(c))
+ return 0;
+
+ shash->tfm = c->hmac_tfm;
+ shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_shash_init(shash);
+ if (err)
+ return err;
+
+ err = crypto_shash_update(shash, well_known_message,
+ sizeof(well_known_message) - 1);
+ if (err < 0)
+ return err;
+
+ err = crypto_shash_final(shash, hmac);
+ if (err)
+ return err;
+ return 0;
+}
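
A hypothetical mount-time check built on this helper (sketch only; the parameter carrying the stored HMAC is an assumption, its real on-media location is defined elsewhere in the series):

    /* Sketch only: reject a wrong key early with a readable error. */
    static int check_auth_key(struct ubifs_info *c, const u8 *stored_hmac_wkm)
    {
            u8 hmac_wkm[UBIFS_HMAC_ARR_SZ];
            int err;

            err = ubifs_hmac_wkm(c, hmac_wkm);
            if (err)
                    return err;

            if (crypto_memneq(hmac_wkm, stored_hmac_wkm, c->hmac_desc_len)) {
                    ubifs_err(c, "provided key does not fit");
                    return -ENOKEY;
            }
            return 0;
    }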
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 564e330d05b1..c49ff50fdceb 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -165,6 +165,8 @@ const char *dbg_ntype(int type)
return "commit start node";
case UBIFS_ORPH_NODE:
return "orphan node";
+ case UBIFS_AUTH_NODE:
+ return "auth node";
default:
return "unknown node";
}
@@ -542,6 +544,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
(unsigned long long)le64_to_cpu(orph->inos[i]));
break;
}
+ case UBIFS_AUTH_NODE:
+ {
+ break;
+ }
default:
pr_err("node type %d was not recognized\n",
(int)ch->node_type);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1b78f2e09218..5d2ffb1a45fc 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1481,7 +1481,7 @@ static int ubifs_migrate_page(struct address_space *mapping,
{
int rc;
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d2680e0b4a36..bf75fdc76fc3 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -254,7 +254,8 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
snod->type == UBIFS_DATA_NODE ||
snod->type == UBIFS_DENT_NODE ||
snod->type == UBIFS_XENT_NODE ||
- snod->type == UBIFS_TRUN_NODE);
+ snod->type == UBIFS_TRUN_NODE ||
+ snod->type == UBIFS_AUTH_NODE);
if (snod->type != UBIFS_INO_NODE &&
snod->type != UBIFS_DATA_NODE &&
@@ -364,12 +365,13 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
/* Write nodes to their new location. Use the first-fit strategy */
while (1) {
- int avail;
+ int avail, moved = 0;
struct ubifs_scan_node *snod, *tmp;
/* Move data nodes */
list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
- avail = c->leb_size - wbuf->offs - wbuf->used;
+ avail = c->leb_size - wbuf->offs - wbuf->used -
+ ubifs_auth_node_sz(c);
if (snod->len > avail)
/*
* Do not skip data nodes in order to optimize
@@ -377,14 +379,21 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
*/
break;
+ err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
+ snod->node, snod->len);
+ if (err)
+ goto out;
+
err = move_node(c, sleb, snod, wbuf);
if (err)
goto out;
+ moved = 1;
}
/* Move non-data nodes */
list_for_each_entry_safe(snod, tmp, &nondata, list) {
- avail = c->leb_size - wbuf->offs - wbuf->used;
+ avail = c->leb_size - wbuf->offs - wbuf->used -
+ ubifs_auth_node_sz(c);
if (avail < min)
break;
@@ -402,9 +411,41 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
continue;
}
+ err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
+ snod->node, snod->len);
+ if (err)
+ goto out;
+
err = move_node(c, sleb, snod, wbuf);
if (err)
goto out;
+ moved = 1;
+ }
+
+ if (ubifs_authenticated(c) && moved) {
+ struct ubifs_auth_node *auth;
+
+ auth = kmalloc(ubifs_auth_node_sz(c), GFP_NOFS);
+ if (!auth) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ubifs_prepare_auth_node(c, auth,
+ c->jheads[GCHD].log_hash);
+ if (err) {
+ kfree(auth);
+ goto out;
+ }
+
+ err = ubifs_wbuf_write_nolock(wbuf, auth,
+ ubifs_auth_node_sz(c));
+ if (err) {
+ kfree(auth);
+ goto out;
+ }
+
+ ubifs_add_dirt(c, wbuf->lnum, ubifs_auth_node_sz(c));
}
if (list_empty(&sleb->nodes) && list_empty(&nondata))
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 099bec94b820..d124117efd42 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -365,20 +365,8 @@ static unsigned long long next_sqnum(struct ubifs_info *c)
return sqnum;
}
-/**
- * ubifs_prepare_node - prepare node to be written to flash.
- * @c: UBIFS file-system description object
- * @node: the node to pad
- * @len: node length
- * @pad: if the buffer has to be padded
- *
- * This function prepares node at @node to be written to the media - it
- * calculates node CRC, fills the common header, and adds proper padding up to
- * the next minimum I/O unit if @pad is not zero.
- */
-void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
+void ubifs_init_node(struct ubifs_info *c, void *node, int len, int pad)
{
- uint32_t crc;
struct ubifs_ch *ch = node;
unsigned long long sqnum = next_sqnum(c);
@@ -389,8 +377,6 @@ void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
ch->group_type = UBIFS_NO_NODE_GROUP;
ch->sqnum = cpu_to_le64(sqnum);
ch->padding[0] = ch->padding[1] = 0;
- crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
- ch->crc = cpu_to_le32(crc);
if (pad) {
len = ALIGN(len, 8);
@@ -399,6 +385,68 @@ void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
}
}
+void ubifs_crc_node(struct ubifs_info *c, void *node, int len)
+{
+ struct ubifs_ch *ch = node;
+ uint32_t crc;
+
+ crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+ ch->crc = cpu_to_le32(crc);
+}
+
+/**
+ * ubifs_prepare_node_hmac - prepare node to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @hmac_offs: offset of the HMAC in the node
+ * @pad: if the buffer has to be padded
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC, fills the common header, and adds proper padding up to
+ * the next minimum I/O unit if @pad is not zero. If @hmac_offs is positive, a
+ * HMAC is inserted into the node at the given offset.
+ *
+ * This function returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_prepare_node_hmac(struct ubifs_info *c, void *node, int len,
+ int hmac_offs, int pad)
+{
+ int err;
+
+ ubifs_init_node(c, node, len, pad);
+
+ if (hmac_offs > 0) {
+ err = ubifs_node_insert_hmac(c, node, len, hmac_offs);
+ if (err)
+ return err;
+ }
+
+ ubifs_crc_node(c, node, len);
+
+ return 0;
+}
+
+/**
+ * ubifs_prepare_node - prepare node to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @pad: if the buffer has to be padded
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC, fills the common header, and adds proper padding up to
+ * the next minimum I/O unit if @pad is not zero.
+ */
+void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
+{
+ /*
+ * Deliberately ignore the return value since this function can only fail
+ * when an HMAC offset is given.
+ */
+ ubifs_prepare_node_hmac(c, node, len, 0, pad);
+}
+
/**
* ubifs_prep_grp_node - prepare node of a group to be written to flash.
* @c: UBIFS file-system description object
@@ -849,12 +897,13 @@ out:
}
/**
- * ubifs_write_node - write node to the media.
+ * ubifs_write_node_hmac - write node to the media.
* @c: UBIFS file-system description object
* @buf: the node to write
* @len: node length
* @lnum: logical eraseblock number
* @offs: offset within the logical eraseblock
+ * @hmac_offs: offset of the HMAC within the node
*
* This function automatically fills node magic number, assigns sequence
* number, and calculates node CRC checksum. The length of the @buf buffer has
@@ -862,8 +911,8 @@ out:
* appends padding node and padding bytes if needed. Returns zero in case of
* success and a negative error code in case of failure.
*/
-int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
- int offs)
+int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int hmac_offs)
{
int err, buf_len = ALIGN(len, c->min_io_size);
@@ -878,7 +927,10 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
if (c->ro_error)
return -EROFS;
- ubifs_prepare_node(c, buf, len, 1);
+ err = ubifs_prepare_node_hmac(c, buf, len, hmac_offs, 1);
+ if (err)
+ return err;
+
err = ubifs_leb_write(c, lnum, buf, offs, buf_len);
if (err)
ubifs_dump_node(c, buf);
@@ -887,6 +939,26 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
}
/**
+ * ubifs_write_node - write node to the media.
+ * @c: UBIFS file-system description object
+ * @buf: the node to write
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function automatically fills node magic number, assigns sequence
+ * number, and calculates node CRC checksum. The length of the @buf buffer has
+ * to be aligned to the minimal I/O unit size. This function automatically
+ * appends padding node and padding bytes if needed. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs)
+{
+ return ubifs_write_node_hmac(c, buf, len, lnum, offs, -1);
+}
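
The ordering inside ubifs_prepare_node_hmac() matters: the HMAC must be computed before the CRC because the CRC covers the HMAC bytes, while the HMAC deliberately skips the CRC field. A condensed sketch of the sequence, using the ubifs_node_insert_hmac() wrapper assumed from the matching header change:

    /* Condensed sketch of the prepare sequence (illustration only). */
    static int prepare_sequence(struct ubifs_info *c, void *node, int len,
                                int hmac_offs)
    {
            int err;

            ubifs_init_node(c, node, len, 0);        /* header, sqnum, padding */

            err = ubifs_node_insert_hmac(c, node, len, hmac_offs);
            if (err)                                 /* HMAC skips magic + CRC */
                    return err;

            ubifs_crc_node(c, node, len);            /* CRC covers the HMAC */
            return 0;
    }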
+
+/**
* ubifs_read_node_wbuf - read node from the media or write-buffer.
* @wbuf: wbuf to check for un-written data
* @buf: buffer to read to
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 802565a17733..729dc76c83df 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -90,6 +90,12 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
memset(trun->padding, 0, 12);
}
+static void ubifs_add_auth_dirt(struct ubifs_info *c, int lnum)
+{
+ if (ubifs_authenticated(c))
+ ubifs_add_dirt(c, lnum, ubifs_auth_node_sz(c));
+}
+
/**
* reserve_space - reserve space in the journal.
* @c: UBIFS file-system description object
@@ -228,34 +234,33 @@ out_return:
return err;
}
-/**
- * write_node - write node to a journal head.
- * @c: UBIFS file-system description object
- * @jhead: journal head
- * @node: node to write
- * @len: node length
- * @lnum: LEB number written is returned here
- * @offs: offset written is returned here
- *
- * This function writes a node to reserved space of journal head @jhead.
- * Returns zero in case of success and a negative error code in case of
- * failure.
- */
-static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
- int *lnum, int *offs)
+static int ubifs_hash_nodes(struct ubifs_info *c, void *node,
+ int len, struct shash_desc *hash)
{
- struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+ int auth_node_size = ubifs_auth_node_sz(c);
+ int err;
- ubifs_assert(c, jhead != GCHD);
+ while (1) {
+ const struct ubifs_ch *ch = node;
+ int nodelen = le32_to_cpu(ch->len);
- *lnum = c->jheads[jhead].wbuf.lnum;
- *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
+ ubifs_assert(c, len >= auth_node_size);
- dbg_jnl("jhead %s, LEB %d:%d, len %d",
- dbg_jhead(jhead), *lnum, *offs, len);
- ubifs_prepare_node(c, node, len, 0);
+ if (len == auth_node_size)
+ break;
+
+ ubifs_assert(c, len > nodelen);
+ ubifs_assert(c, ch->magic == cpu_to_le32(UBIFS_NODE_MAGIC));
- return ubifs_wbuf_write_nolock(wbuf, node, len);
+ err = ubifs_shash_update(c, hash, (void *)node, nodelen);
+ if (err)
+ return err;
+
+ node += ALIGN(nodelen, 8);
+ len -= ALIGN(nodelen, 8);
+ }
+
+ return ubifs_prepare_auth_node(c, node, hash);
}
/**
@@ -268,9 +273,9 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
* @offs: offset written is returned here
* @sync: non-zero if the write-buffer has to by synchronized
*
- * This function is the same as 'write_node()' but it does not assume the
- * buffer it is writing is a node, so it does not prepare it (which means
- * initializing common header and calculating CRC).
+ * This function writes data to the reserved space of journal head @jhead.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
*/
static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
int *lnum, int *offs, int sync)
@@ -285,6 +290,12 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
dbg_jnl("jhead %s, LEB %d:%d, len %d",
dbg_jhead(jhead), *lnum, *offs, len);
+ if (ubifs_authenticated(c)) {
+ err = ubifs_hash_nodes(c, buf, len, c->jheads[jhead].log_hash);
+ if (err)
+ return err;
+ }
+
err = ubifs_wbuf_write_nolock(wbuf, buf, len);
if (err)
return err;
@@ -548,6 +559,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
struct ubifs_dent_node *dent;
struct ubifs_ino_node *ino;
union ubifs_key dent_key, ino_key;
+ u8 hash_dent[UBIFS_HASH_ARR_SZ];
+ u8 hash_ino[UBIFS_HASH_ARR_SZ];
+ u8 hash_ino_host[UBIFS_HASH_ARR_SZ];
ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex));
@@ -570,7 +584,10 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
/* Make sure to also account for extended attributes */
- len += host_ui->data_len;
+ if (ubifs_authenticated(c))
+ len += ALIGN(host_ui->data_len, 8) + ubifs_auth_node_sz(c);
+ else
+ len += host_ui->data_len;
dent = kzalloc(len, GFP_NOFS);
if (!dent)
@@ -602,11 +619,21 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen, 0);
+ err = ubifs_node_calc_hash(c, dent, hash_dent);
+ if (err)
+ goto out_release;
ino = (void *)dent + aligned_dlen;
pack_inode(c, ino, inode, 0);
+ err = ubifs_node_calc_hash(c, ino, hash_ino);
+ if (err)
+ goto out_release;
+
ino = (void *)ino + aligned_ilen;
pack_inode(c, ino, dir, 1);
+ err = ubifs_node_calc_hash(c, ino, hash_ino_host);
+ if (err)
+ goto out_release;
if (last_reference) {
err = ubifs_add_orphan(c, inode->i_ino);
@@ -628,6 +655,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
}
release_head(c, BASEHD);
kfree(dent);
+ ubifs_add_auth_dirt(c, lnum);
if (deletion) {
if (nm->hash)
@@ -638,7 +666,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
goto out_ro;
err = ubifs_add_dirt(c, lnum, dlen);
} else
- err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
+ err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen,
+ hash_dent, nm);
if (err)
goto out_ro;
@@ -650,14 +679,14 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
*/
ino_key_init(c, &ino_key, inode->i_ino);
ino_offs = dent_offs + aligned_dlen;
- err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
+ err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen, hash_ino);
if (err)
goto out_ro;
ino_key_init(c, &ino_key, dir->i_ino);
ino_offs += aligned_ilen;
err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs,
- UBIFS_INO_NODE_SZ + host_ui->data_len);
+ UBIFS_INO_NODE_SZ + host_ui->data_len, hash_ino_host);
if (err)
goto out_ro;
@@ -706,10 +735,12 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
const union ubifs_key *key, const void *buf, int len)
{
struct ubifs_data_node *data;
- int err, lnum, offs, compr_type, out_len, compr_len;
+ int err, lnum, offs, compr_type, out_len, compr_len, auth_len;
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
+ int write_len;
struct ubifs_inode *ui = ubifs_inode(inode);
bool encrypted = ubifs_crypt_is_encrypted(inode);
+ u8 hash[UBIFS_HASH_ARR_SZ];
dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
(unsigned long)key_inum(c, key), key_block(c, key), len);
@@ -718,7 +749,9 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
if (encrypted)
dlen += UBIFS_CIPHER_BLOCK_SIZE;
- data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
+ auth_len = ubifs_auth_node_sz(c);
+
+ data = kmalloc(dlen + auth_len, GFP_NOFS | __GFP_NOWARN);
if (!data) {
/*
* Fall-back to the write reserve buffer. Note, we might be
@@ -757,20 +790,33 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
}
dlen = UBIFS_DATA_NODE_SZ + out_len;
+ if (ubifs_authenticated(c))
+ write_len = ALIGN(dlen, 8) + auth_len;
+ else
+ write_len = dlen;
+
data->compr_type = cpu_to_le16(compr_type);
/* Make reservation before allocating sequence numbers */
- err = make_reservation(c, DATAHD, dlen);
+ err = make_reservation(c, DATAHD, write_len);
if (err)
goto out_free;
- err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
+ ubifs_prepare_node(c, data, dlen, 0);
+ err = write_head(c, DATAHD, data, write_len, &lnum, &offs, 0);
+ if (err)
+ goto out_release;
+
+ err = ubifs_node_calc_hash(c, data, hash);
if (err)
goto out_release;
+
ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
release_head(c, DATAHD);
- err = ubifs_tnc_add(c, key, lnum, offs, dlen);
+ ubifs_add_auth_dirt(c, lnum);
+
+ err = ubifs_tnc_add(c, key, lnum, offs, dlen, hash);
if (err)
goto out_ro;
@@ -808,7 +854,9 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
int err, lnum, offs;
struct ubifs_ino_node *ino;
struct ubifs_inode *ui = ubifs_inode(inode);
- int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
+ int sync = 0, write_len, ilen = UBIFS_INO_NODE_SZ;
+ int last_reference = !inode->i_nlink;
+ u8 hash[UBIFS_HASH_ARR_SZ];
dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
@@ -817,20 +865,30 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
* need to synchronize the write-buffer either.
*/
if (!last_reference) {
- len += ui->data_len;
+ ilen += ui->data_len;
sync = IS_SYNC(inode);
}
- ino = kmalloc(len, GFP_NOFS);
+
+ if (ubifs_authenticated(c))
+ write_len = ALIGN(ilen, 8) + ubifs_auth_node_sz(c);
+ else
+ write_len = ilen;
+
+ ino = kmalloc(write_len, GFP_NOFS);
if (!ino)
return -ENOMEM;
/* Make reservation before allocating sequence numbers */
- err = make_reservation(c, BASEHD, len);
+ err = make_reservation(c, BASEHD, write_len);
if (err)
goto out_free;
pack_inode(c, ino, inode, 1);
- err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+ err = ubifs_node_calc_hash(c, ino, hash);
+ if (err)
+ goto out_release;
+
+ err = write_head(c, BASEHD, ino, write_len, &lnum, &offs, sync);
if (err)
goto out_release;
if (!sync)
@@ -838,17 +896,19 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
inode->i_ino);
release_head(c, BASEHD);
+ ubifs_add_auth_dirt(c, lnum);
+
if (last_reference) {
err = ubifs_tnc_remove_ino(c, inode->i_ino);
if (err)
goto out_ro;
ubifs_delete_orphan(c, inode->i_ino);
- err = ubifs_add_dirt(c, lnum, len);
+ err = ubifs_add_dirt(c, lnum, ilen);
} else {
union ubifs_key key;
ino_key_init(c, &key, inode->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, len);
+ err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash);
}
if (err)
goto out_ro;
@@ -958,6 +1018,10 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
int aligned_dlen1, aligned_dlen2;
int twoparents = (fst_dir != snd_dir);
void *p;
+ u8 hash_dent1[UBIFS_HASH_ARR_SZ];
+ u8 hash_dent2[UBIFS_HASH_ARR_SZ];
+ u8 hash_p1[UBIFS_HASH_ARR_SZ];
+ u8 hash_p2[UBIFS_HASH_ARR_SZ];
ubifs_assert(c, ubifs_inode(fst_dir)->data_len == 0);
ubifs_assert(c, ubifs_inode(snd_dir)->data_len == 0);
@@ -973,6 +1037,8 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
if (twoparents)
len += plen;
+ len += ubifs_auth_node_sz(c);
+
dent1 = kzalloc(len, GFP_NOFS);
if (!dent1)
return -ENOMEM;
@@ -993,6 +1059,9 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
set_dent_cookie(c, dent1);
zero_dent_node_unused(dent1);
ubifs_prep_grp_node(c, dent1, dlen1, 0);
+ err = ubifs_node_calc_hash(c, dent1, hash_dent1);
+ if (err)
+ goto out_release;
/* Make new dent for 2nd entry */
dent2 = (void *)dent1 + aligned_dlen1;
@@ -1006,14 +1075,26 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
set_dent_cookie(c, dent2);
zero_dent_node_unused(dent2);
ubifs_prep_grp_node(c, dent2, dlen2, 0);
+ err = ubifs_node_calc_hash(c, dent2, hash_dent2);
+ if (err)
+ goto out_release;
p = (void *)dent2 + aligned_dlen2;
- if (!twoparents)
+ if (!twoparents) {
pack_inode(c, p, fst_dir, 1);
- else {
+ err = ubifs_node_calc_hash(c, p, hash_p1);
+ if (err)
+ goto out_release;
+ } else {
pack_inode(c, p, fst_dir, 0);
+ err = ubifs_node_calc_hash(c, p, hash_p1);
+ if (err)
+ goto out_release;
p += ALIGN(plen, 8);
pack_inode(c, p, snd_dir, 1);
+ err = ubifs_node_calc_hash(c, p, hash_p2);
+ if (err)
+ goto out_release;
}
err = write_head(c, BASEHD, dent1, len, &lnum, &offs, sync);
@@ -1027,28 +1108,30 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
}
release_head(c, BASEHD);
+ ubifs_add_auth_dirt(c, lnum);
+
dent_key_init(c, &key, snd_dir->i_ino, snd_nm);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, hash_dent1, snd_nm);
if (err)
goto out_ro;
offs += aligned_dlen1;
dent_key_init(c, &key, fst_dir->i_ino, fst_nm);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, fst_nm);
if (err)
goto out_ro;
offs += aligned_dlen2;
ino_key_init(c, &key, fst_dir->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_p1);
if (err)
goto out_ro;
if (twoparents) {
offs += ALIGN(plen, 8);
ino_key_init(c, &key, snd_dir->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_p2);
if (err)
goto out_ro;
}
@@ -1101,6 +1184,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
struct ubifs_inode *uninitialized_var(new_ui);
+ u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
+ u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
+ u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
+ u8 hash_dent1[UBIFS_HASH_ARR_SZ];
+ u8 hash_dent2[UBIFS_HASH_ARR_SZ];
ubifs_assert(c, ubifs_inode(old_dir)->data_len == 0);
ubifs_assert(c, ubifs_inode(new_dir)->data_len == 0);
@@ -1123,6 +1211,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
if (move)
len += plen;
+
+ len += ubifs_auth_node_sz(c);
+
dent = kzalloc(len, GFP_NOFS);
if (!dent)
return -ENOMEM;
@@ -1143,6 +1234,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
set_dent_cookie(c, dent);
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen1, 0);
+ err = ubifs_node_calc_hash(c, dent, hash_dent1);
+ if (err)
+ goto out_release;
dent2 = (void *)dent + aligned_dlen1;
dent2->ch.node_type = UBIFS_DENT_NODE;
@@ -1162,19 +1256,36 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
set_dent_cookie(c, dent2);
zero_dent_node_unused(dent2);
ubifs_prep_grp_node(c, dent2, dlen2, 0);
+ err = ubifs_node_calc_hash(c, dent2, hash_dent2);
+ if (err)
+ goto out_release;
p = (void *)dent2 + aligned_dlen2;
if (new_inode) {
pack_inode(c, p, new_inode, 0);
+ err = ubifs_node_calc_hash(c, p, hash_new_inode);
+ if (err)
+ goto out_release;
+
p += ALIGN(ilen, 8);
}
- if (!move)
+ if (!move) {
pack_inode(c, p, old_dir, 1);
- else {
+ err = ubifs_node_calc_hash(c, p, hash_old_dir);
+ if (err)
+ goto out_release;
+ } else {
pack_inode(c, p, old_dir, 0);
+ err = ubifs_node_calc_hash(c, p, hash_old_dir);
+ if (err)
+ goto out_release;
+
p += ALIGN(plen, 8);
pack_inode(c, p, new_dir, 1);
+ err = ubifs_node_calc_hash(c, p, hash_new_dir);
+ if (err)
+ goto out_release;
}
if (last_reference) {
@@ -1200,15 +1311,17 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
}
release_head(c, BASEHD);
+ ubifs_add_auth_dirt(c, lnum);
+
dent_key_init(c, &key, new_dir->i_ino, new_nm);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, hash_dent1, new_nm);
if (err)
goto out_ro;
offs += aligned_dlen1;
if (whiteout) {
dent_key_init(c, &key, old_dir->i_ino, old_nm);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm);
if (err)
goto out_ro;
@@ -1227,21 +1340,21 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
offs += aligned_dlen2;
if (new_inode) {
ino_key_init(c, &key, new_inode->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
+ err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash_new_inode);
if (err)
goto out_ro;
offs += ALIGN(ilen, 8);
}
ino_key_init(c, &key, old_dir->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir);
if (err)
goto out_ro;
if (move) {
offs += ALIGN(plen, 8);
ino_key_init(c, &key, new_dir->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_new_dir);
if (err)
goto out_ro;
}
@@ -1360,6 +1473,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
struct ubifs_inode *ui = ubifs_inode(inode);
ino_t inum = inode->i_ino;
unsigned int blk;
+ u8 hash_ino[UBIFS_HASH_ARR_SZ];
+ u8 hash_dn[UBIFS_HASH_ARR_SZ];
dbg_jnl("ino %lu, size %lld -> %lld",
(unsigned long)inum, old_size, new_size);
@@ -1369,6 +1484,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+
+ sz += ubifs_auth_node_sz(c);
+
ino = kmalloc(sz, GFP_NOFS);
if (!ino)
return -ENOMEM;
@@ -1414,16 +1532,28 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
/* Must make reservation before allocating sequence numbers */
len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
- if (dlen)
+
+ if (ubifs_authenticated(c))
+ len += ALIGN(dlen, 8) + ubifs_auth_node_sz(c);
+ else
len += dlen;
+
err = make_reservation(c, BASEHD, len);
if (err)
goto out_free;
pack_inode(c, ino, inode, 0);
+ err = ubifs_node_calc_hash(c, ino, hash_ino);
+ if (err)
+ goto out_release;
+
ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
- if (dlen)
+ if (dlen) {
ubifs_prep_grp_node(c, dn, dlen, 1);
+ err = ubifs_node_calc_hash(c, dn, hash_dn);
+ if (err)
+ goto out_release;
+ }
err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
if (err)
@@ -1432,15 +1562,17 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
release_head(c, BASEHD);
+ ubifs_add_auth_dirt(c, lnum);
+
if (dlen) {
sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
- err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
+ err = ubifs_tnc_add(c, &key, lnum, sz, dlen, hash_dn);
if (err)
goto out_ro;
}
ino_key_init(c, &key, inum);
- err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
+ err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ, hash_ino);
if (err)
goto out_ro;
@@ -1495,12 +1627,13 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
const struct inode *inode,
const struct fscrypt_name *nm)
{
- int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
+ int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen, write_len;
struct ubifs_dent_node *xent;
struct ubifs_ino_node *ino;
union ubifs_key xent_key, key1, key2;
int sync = IS_DIRSYNC(host);
struct ubifs_inode *host_ui = ubifs_inode(host);
+ u8 hash[UBIFS_HASH_ARR_SZ];
ubifs_assert(c, inode->i_nlink == 0);
ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex));
@@ -1514,12 +1647,14 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
- xent = kzalloc(len, GFP_NOFS);
+ write_len = len + ubifs_auth_node_sz(c);
+
+ xent = kzalloc(write_len, GFP_NOFS);
if (!xent)
return -ENOMEM;
/* Make reservation before allocating sequence numbers */
- err = make_reservation(c, BASEHD, len);
+ err = make_reservation(c, BASEHD, write_len);
if (err) {
kfree(xent);
return err;
@@ -1540,11 +1675,16 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
pack_inode(c, ino, inode, 0);
ino = (void *)ino + UBIFS_INO_NODE_SZ;
pack_inode(c, ino, host, 1);
+ err = ubifs_node_calc_hash(c, ino, hash);
+ if (err)
+ goto out_release;
- err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
+ err = write_head(c, BASEHD, xent, write_len, &lnum, &xent_offs, sync);
if (!sync && !err)
ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
release_head(c, BASEHD);
+
+ ubifs_add_auth_dirt(c, lnum);
kfree(xent);
if (err)
goto out_ro;
@@ -1572,7 +1712,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
/* And update TNC with the new host inode position */
ino_key_init(c, &key1, host->i_ino);
- err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
+ err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen, hash);
if (err)
goto out_ro;
@@ -1583,6 +1723,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
mark_inode_clean(c, host_ui);
return 0;
+out_release:
+ kfree(xent);
+ release_head(c, BASEHD);
out_ro:
ubifs_ro_mode(c, err);
finish_reservation(c);
@@ -1610,6 +1753,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
struct ubifs_ino_node *ino;
union ubifs_key key;
int sync = IS_DIRSYNC(host);
+ u8 hash_host[UBIFS_HASH_ARR_SZ];
+ u8 hash[UBIFS_HASH_ARR_SZ];
dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
ubifs_assert(c, host->i_nlink > 0);
@@ -1621,6 +1766,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
aligned_len1 = ALIGN(len1, 8);
aligned_len = aligned_len1 + ALIGN(len2, 8);
+ aligned_len += ubifs_auth_node_sz(c);
+
ino = kzalloc(aligned_len, GFP_NOFS);
if (!ino)
return -ENOMEM;
@@ -1631,7 +1778,13 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
goto out_free;
pack_inode(c, ino, host, 0);
+ err = ubifs_node_calc_hash(c, ino, hash_host);
+ if (err)
+ goto out_release;
pack_inode(c, (void *)ino + aligned_len1, inode, 1);
+ err = ubifs_node_calc_hash(c, (void *)ino + aligned_len1, hash);
+ if (err)
+ goto out_release;
err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
if (!sync && !err) {
@@ -1644,13 +1797,15 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
if (err)
goto out_ro;
+ ubifs_add_auth_dirt(c, lnum);
+
ino_key_init(c, &key, host->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs, len1);
+ err = ubifs_tnc_add(c, &key, lnum, offs, len1, hash_host);
if (err)
goto out_ro;
ino_key_init(c, &key, inode->i_ino);
- err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
+ err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2, hash);
if (err)
goto out_ro;
@@ -1662,6 +1817,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
kfree(ino);
return 0;
+out_release:
+ release_head(c, BASEHD);
out_ro:
ubifs_ro_mode(c, err);
finish_reservation(c);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 86b0828f5499..15fd854149bb 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -236,6 +236,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
bud->lnum = lnum;
bud->start = offs;
bud->jhead = jhead;
+ bud->log_hash = NULL;
ref->ch.node_type = UBIFS_REF_NODE;
ref->lnum = cpu_to_le32(bud->lnum);
@@ -275,6 +276,14 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
if (err)
goto out_unlock;
+ err = ubifs_shash_update(c, c->log_hash, ref, UBIFS_REF_NODE_SZ);
+ if (err)
+ goto out_unlock;
+
+ err = ubifs_shash_copy_state(c, c->log_hash, c->jheads[jhead].log_hash);
+ if (err)
+ goto out_unlock;
+
c->lhead_offs += c->ref_node_alsz;
ubifs_add_bud(c, bud);
@@ -377,6 +386,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
cs->cmt_no = cpu_to_le64(c->cmt_no);
ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
+ err = ubifs_shash_init(c, c->log_hash);
+ if (err)
+ goto out;
+
+ err = ubifs_shash_update(c, c->log_hash, cs, UBIFS_CS_NODE_SZ);
+ if (err < 0)
+ goto out;
+
/*
* Note, we do not lock 'c->log_mutex' because this is the commit start
* phase and we are exclusively using the log. And we do not lock
@@ -402,6 +419,12 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
len += UBIFS_REF_NODE_SZ;
+
+ err = ubifs_shash_update(c, c->log_hash, ref,
+ UBIFS_REF_NODE_SZ);
+ if (err)
+ goto out;
+ err = ubifs_shash_copy_state(c, c->log_hash,
+ c->jheads[i].log_hash);
+ if (err)
+ goto out;
}
ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
@@ -516,6 +539,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
if (err)
return err;
list_del(&bud->list);
+ kfree(bud->log_hash);
kfree(bud);
}
mutex_lock(&c->log_mutex);
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 31393370e334..b0c5f06128b5 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -604,11 +604,12 @@ static int calc_pnode_num_from_parent(const struct ubifs_info *c,
* @lpt_first: LEB number of first LPT LEB
* @lpt_lebs: number of LEBs for LPT is passed and returned here
* @big_lpt: use big LPT model is passed and returned here
+ * @hash: hash of the LPT is returned here
*
* This function returns %0 on success and a negative error code on failure.
*/
int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
- int *lpt_lebs, int *big_lpt)
+ int *lpt_lebs, int *big_lpt, u8 *hash)
{
int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
int blnum, boffs, bsz, bcnt;
@@ -617,6 +618,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
void *buf = NULL, *p;
struct ubifs_lpt_lprops *ltab = NULL;
int *lsave = NULL;
+ struct shash_desc *desc;
err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
if (err)
@@ -630,6 +632,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
/* Needed by 'ubifs_pack_lsave()' */
c->main_first = c->leb_cnt - *main_lebs;
+ desc = ubifs_hash_get_desc(c);
+ if (IS_ERR(desc))
+ return PTR_ERR(desc);
+
lsave = kmalloc_array(c->lsave_cnt, sizeof(int), GFP_KERNEL);
pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
@@ -677,6 +683,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
/* Add first pnode */
ubifs_pack_pnode(c, p, pnode);
+ err = ubifs_shash_update(c, desc, p, c->pnode_sz);
+ if (err)
+ goto out;
+
p += c->pnode_sz;
len = c->pnode_sz;
pnode->num += 1;
@@ -711,6 +721,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
len = 0;
}
ubifs_pack_pnode(c, p, pnode);
+ err = ubifs_shash_update(c, desc, p, c->pnode_sz);
+ if (err)
+ goto out;
+
p += c->pnode_sz;
len += c->pnode_sz;
/*
@@ -830,6 +844,10 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
if (err)
goto out;
+ err = ubifs_shash_final(c, desc, hash);
+ if (err)
+ goto out;
+
c->nhead_lnum = lnum;
c->nhead_offs = ALIGN(len, c->min_io_size);
@@ -853,6 +871,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
out:
c->ltab = NULL;
+ kfree(desc);
kfree(lsave);
vfree(ltab);
vfree(buf);
@@ -1439,26 +1458,25 @@ struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
}
/**
- * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * ubifs_pnode_lookup - lookup a pnode in the LPT.
* @c: UBIFS file-system description object
- * @lnum: LEB number to lookup
+ * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT)
*
- * This function returns a pointer to the LEB properties on success or a
- * negative error code on failure.
+ * This function returns a pointer to the pnode on success or a negative
+ * error code on failure.
*/
-struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+struct ubifs_pnode *ubifs_pnode_lookup(struct ubifs_info *c, int i)
{
- int err, i, h, iip, shft;
+ int err, h, iip, shft;
struct ubifs_nnode *nnode;
- struct ubifs_pnode *pnode;
if (!c->nroot) {
err = ubifs_read_nnode(c, NULL, 0);
if (err)
return ERR_PTR(err);
}
+ i <<= UBIFS_LPT_FANOUT_SHIFT;
nnode = c->nroot;
- i = lnum - c->main_first;
shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
for (h = 1; h < c->lpt_hght; h++) {
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
@@ -1468,7 +1486,24 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
return ERR_CAST(nnode);
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
- pnode = ubifs_get_pnode(c, nnode, iip);
+ return ubifs_get_pnode(c, nnode, iip);
+}
+
+/**
+ * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+{
+ int i, iip;
+ struct ubifs_pnode *pnode;
+
+ i = lnum - c->main_first;
+ pnode = ubifs_pnode_lookup(c, i >> UBIFS_LPT_FANOUT_SHIFT);
if (IS_ERR(pnode))
return ERR_CAST(pnode);
iip = (i & (UBIFS_LPT_FANOUT - 1));
@@ -1620,6 +1655,131 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
}
/**
+ * ubifs_lpt_calc_hash - Calculate hash of the LPT pnodes
+ * @c: UBIFS file-system description object
+ * @hash: the returned hash of the LPT pnodes
+ *
+ * This function iterates over the LPT pnodes and creates a hash over them.
+ * Returns 0 for success or a negative error code otherwise.
+ */
+int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash)
+{
+ struct ubifs_nnode *nnode, *nn;
+ struct ubifs_cnode *cnode;
+ struct shash_desc *desc;
+ int iip = 0, i;
+ int bufsiz = max_t(int, c->nnode_sz, c->pnode_sz);
+ void *buf;
+ int err;
+
+ if (!ubifs_authenticated(c))
+ return 0;
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return err;
+ }
+
+ desc = ubifs_hash_get_desc(c);
+ if (IS_ERR(desc))
+ return PTR_ERR(desc);
+
+ buf = kmalloc(bufsiz, GFP_NOFS);
+ if (!buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cnode = (struct ubifs_cnode *)c->nroot;
+
+ while (cnode) {
+ nnode = cnode->parent;
+ nn = (struct ubifs_nnode *)cnode;
+ if (cnode->level > 1) {
+ while (iip < UBIFS_LPT_FANOUT) {
+ if (nn->nbranch[iip].lnum == 0) {
+ /* Go right */
+ iip++;
+ continue;
+ }
+
+ nnode = ubifs_get_nnode(c, nn, iip);
+ if (IS_ERR(nnode)) {
+ err = PTR_ERR(nnode);
+ goto out;
+ }
+
+ /* Go down */
+ iip = 0;
+ cnode = (struct ubifs_cnode *)nnode;
+ break;
+ }
+ if (iip < UBIFS_LPT_FANOUT)
+ continue;
+ } else {
+ struct ubifs_pnode *pnode;
+
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (nn->nbranch[i].lnum == 0)
+ continue;
+ pnode = ubifs_get_pnode(c, nn, i);
+ if (IS_ERR(pnode)) {
+ err = PTR_ERR(pnode);
+ goto out;
+ }
+
+ ubifs_pack_pnode(c, buf, pnode);
+ err = ubifs_shash_update(c, desc, buf,
+ c->pnode_sz);
+ if (err)
+ goto out;
+ }
+ }
+ /* Go up and to the right */
+ iip = cnode->iip + 1;
+ cnode = (struct ubifs_cnode *)nnode;
+ }
+
+ err = ubifs_shash_final(c, desc, hash);
+out:
+ kfree(desc);
+ kfree(buf);
+
+ return err;
+}
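
The iterative walk above avoids recursion to keep kernel stack usage bounded; its visiting order matches this simpler recursive sketch (illustration only, ERR_PTR checks and error paths omitted):

    /* Recursive equivalent of the pnode walk (sketch, not kernel code). */
    static int hash_subtree(struct ubifs_info *c, struct ubifs_nnode *nn,
                            struct shash_desc *desc, void *buf)
    {
            int i, err;

            for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
                    if (nn->nbranch[i].lnum == 0)
                            continue;
                    if (nn->level > 1) {
                            err = hash_subtree(c, ubifs_get_nnode(c, nn, i),
                                               desc, buf);
                    } else {
                            ubifs_pack_pnode(c, buf, ubifs_get_pnode(c, nn, i));
                            err = ubifs_shash_update(c, desc, buf, c->pnode_sz);
                    }
                    if (err)
                            return err;
            }
            return 0;
    }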
+
+/**
+ * lpt_check_hash - check the hash of the LPT.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates a hash over all pnodes in the LPT and compares it with
+ * the hash stored in the master node. Returns %0 on success and a negative error
+ * code on failure.
+ */
+static int lpt_check_hash(struct ubifs_info *c)
+{
+ int err;
+ u8 hash[UBIFS_HASH_ARR_SZ];
+
+ if (!ubifs_authenticated(c))
+ return 0;
+
+ err = ubifs_lpt_calc_hash(c, hash);
+ if (err)
+ return err;
+
+ if (ubifs_check_hash(c, c->mst_node->hash_lpt, hash)) {
+ err = -EPERM;
+ ubifs_err(c, "Failed to authenticate LPT");
+ } else {
+ err = 0;
+ }
+
+ return err;
+}
+
+/**
* lpt_init_rd - initialize the LPT for reading.
* @c: UBIFS file-system description object
*
@@ -1660,6 +1820,10 @@ static int lpt_init_rd(struct ubifs_info *c)
if (err)
return err;
+ err = lpt_check_hash(c);
+ if (err)
+ return err;
+
dbg_lp("space_bits %d", c->space_bits);
dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 7ce30994bbba..1f88caffdf2a 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -619,38 +619,6 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
}
/**
- * pnode_lookup - lookup a pnode in the LPT.
- * @c: UBIFS file-system description object
- * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT))
- *
- * This function returns a pointer to the pnode on success or a negative
- * error code on failure.
- */
-static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
-{
- int err, h, iip, shft;
- struct ubifs_nnode *nnode;
-
- if (!c->nroot) {
- err = ubifs_read_nnode(c, NULL, 0);
- if (err)
- return ERR_PTR(err);
- }
- i <<= UBIFS_LPT_FANOUT_SHIFT;
- nnode = c->nroot;
- shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
- for (h = 1; h < c->lpt_hght; h++) {
- iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
- shft -= UBIFS_LPT_FANOUT_SHIFT;
- nnode = ubifs_get_nnode(c, nnode, iip);
- if (IS_ERR(nnode))
- return ERR_CAST(nnode);
- }
- iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
- return ubifs_get_pnode(c, nnode, iip);
-}
-
-/**
* add_pnode_dirt - add dirty space to LPT LEB properties.
* @c: UBIFS file-system description object
* @pnode: pnode for which to add dirt
@@ -702,7 +670,7 @@ static int make_tree_dirty(struct ubifs_info *c)
{
struct ubifs_pnode *pnode;
- pnode = pnode_lookup(c, 0);
+ pnode = ubifs_pnode_lookup(c, 0);
if (IS_ERR(pnode))
return PTR_ERR(pnode);
@@ -956,7 +924,7 @@ static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum,
struct ubifs_pnode *pnode;
struct ubifs_nbranch *branch;
- pnode = pnode_lookup(c, node_num);
+ pnode = ubifs_pnode_lookup(c, node_num);
if (IS_ERR(pnode))
return PTR_ERR(pnode);
branch = &pnode->parent->nbranch[pnode->iip];
@@ -1279,6 +1247,10 @@ int ubifs_lpt_start_commit(struct ubifs_info *c)
if (err)
goto out;
+ err = ubifs_lpt_calc_hash(c, c->mst_node->hash_lpt);
+ if (err)
+ goto out;
+
/* Copy the LPT's own lprops for end commit to write */
memcpy(c->ltab_cmt, c->ltab,
sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
@@ -1558,7 +1530,7 @@ static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs)
struct ubifs_nbranch *branch;
cond_resched();
- pnode = pnode_lookup(c, i);
+ pnode = ubifs_pnode_lookup(c, i);
if (IS_ERR(pnode))
return PTR_ERR(pnode);
branch = &pnode->parent->nbranch[pnode->iip];
@@ -1710,7 +1682,7 @@ int dbg_check_ltab(struct ubifs_info *c)
for (i = 0; i < cnt; i++) {
struct ubifs_pnode *pnode;
- pnode = pnode_lookup(c, i);
+ pnode = ubifs_pnode_lookup(c, i);
if (IS_ERR(pnode))
return PTR_ERR(pnode);
cond_resched();
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 9df4a41bba52..5ea51bbd14c7 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -25,6 +25,42 @@
#include "ubifs.h"
/**
+ * ubifs_compare_master_node - compare two UBIFS master nodes
+ * @c: UBIFS file-system description object
+ * @m1: the first node
+ * @m2: the second node
+ *
+ * This function compares two UBIFS master nodes. Returns 0 if they are equal
+ * and nonzero if not.
+ */
+int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2)
+{
+ int ret;
+ int behind;
+ int hmac_offs = offsetof(struct ubifs_mst_node, hmac);
+
+ /*
+ * Do not compare the common node header since the sequence number and
+ * hence the CRC are different.
+ */
+ ret = memcmp(m1 + UBIFS_CH_SZ, m2 + UBIFS_CH_SZ,
+ hmac_offs - UBIFS_CH_SZ);
+ if (ret)
+ return ret;
+
+ /*
+ * Do not compare the embedded HMAC either, since it too must differ
+ * due to the different common node header.
+ */
+ behind = hmac_offs + UBIFS_MAX_HMAC_LEN;
+
+ if (UBIFS_MST_NODE_SZ > behind)
+ return memcmp(m1 + behind, m2 + behind, UBIFS_MST_NODE_SZ - behind);
+
+ return 0;
+}
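
Put differently, the comparison covers two disjoint byte ranges (annotation, not patch content):

    /*
     * Ranges of the master node handled by ubifs_compare_master_node()
     * (illustration only):
     *
     *   [0, UBIFS_CH_SZ)             skipped - sqnum and CRC always differ
     *   [UBIFS_CH_SZ, hmac_offs)     compared
     *   [hmac_offs, behind)          skipped - the embedded HMAC differs too
     *   [behind, UBIFS_MST_NODE_SZ)  compared, if anything follows the HMAC
     */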
+
+/**
* scan_for_master - search the valid master node.
* @c: UBIFS file-system description object
*
@@ -37,7 +73,7 @@ static int scan_for_master(struct ubifs_info *c)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
- int lnum, offs = 0, nodes_cnt;
+ int lnum, offs = 0, nodes_cnt, err;
lnum = UBIFS_MST_LNUM;
@@ -69,12 +105,23 @@ static int scan_for_master(struct ubifs_info *c)
goto out_dump;
if (snod->offs != offs)
goto out;
- if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
- (void *)snod->node + UBIFS_CH_SZ,
- UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+ if (ubifs_compare_master_node(c, c->mst_node, snod->node))
goto out;
+
c->mst_offs = offs;
ubifs_scan_destroy(sleb);
+
+ if (!ubifs_authenticated(c))
+ return 0;
+
+ err = ubifs_node_verify_hmac(c, c->mst_node,
+ sizeof(struct ubifs_mst_node),
+ offsetof(struct ubifs_mst_node, hmac));
+ if (err) {
+ ubifs_err(c, "Failed to verify master node HMAC");
+ return -EPERM;
+ }
+
return 0;
out:
@@ -305,6 +352,8 @@ int ubifs_read_master(struct ubifs_info *c)
c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
+ ubifs_copy_hash(c, c->mst_node->hash_root_idx, c->zroot.hash);
+
c->calc_idx_sz = c->bi.old_idx_sz;
if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
@@ -378,7 +427,9 @@ int ubifs_write_master(struct ubifs_info *c)
c->mst_offs = offs;
c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
- err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
+ ubifs_copy_hash(c, c->zroot.hash, c->mst_node->hash_root_idx);
+ err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs,
+ offsetof(struct ubifs_mst_node, hmac));
if (err)
return err;
@@ -389,7 +440,8 @@ int ubifs_write_master(struct ubifs_info *c)
if (err)
return err;
}
- err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
+ err = ubifs_write_node_hmac(c, c->mst_node, len, lnum, offs,
+ offsetof(struct ubifs_mst_node, hmac));
return err;
}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 21d35d7dd975..6f87237fdbf4 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -197,7 +197,8 @@ static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
*/
static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
{
- return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
+ return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len + c->hash_len)
+ * child_cnt;
}
/**
@@ -212,7 +213,7 @@ struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
int bnum)
{
return (struct ubifs_branch *)((void *)idx->branches +
- (UBIFS_BRANCH_SZ + c->key_len) * bnum);
+ (UBIFS_BRANCH_SZ + c->key_len + c->hash_len) * bnum);
}
/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 984e30e83c0b..8526b7ec4707 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -212,7 +212,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c,
save_flags = mst->flags;
mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
- ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
+ err = ubifs_prepare_node_hmac(c, mst, UBIFS_MST_NODE_SZ,
+ offsetof(struct ubifs_mst_node, hmac), 1);
+ if (err)
+ goto out;
err = ubifs_leb_change(c, lnum, mst, sz);
if (err)
goto out;
@@ -264,9 +267,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
offs2 = (void *)mst2 - buf2;
if (offs1 == offs2) {
/* Same offset, so must be the same */
- if (memcmp((void *)mst1 + UBIFS_CH_SZ,
- (void *)mst2 + UBIFS_CH_SZ,
- UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+ if (ubifs_compare_master_node(c, mst1, mst2))
goto out_err;
mst = mst1;
} else if (offs2 + sz == offs1) {
@@ -1462,15 +1463,81 @@ out:
}
/**
+ * inode_fix_size - fix inode size
+ * @c: UBIFS file-system description object
+ * @e: inode size information for recovery
+ */
+static int inode_fix_size(struct ubifs_info *c, struct size_entry *e)
+{
+ struct inode *inode;
+ struct ubifs_inode *ui;
+ int err;
+
+ if (c->ro_mount)
+ ubifs_assert(c, !e->inode);
+
+ if (e->inode) {
+ /* Remounting rw, pick up inode we stored earlier */
+ inode = e->inode;
+ } else {
+ inode = ubifs_iget(c->vfs_sb, e->inum);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (inode->i_size >= e->d_size) {
+ /*
+ * The original inode in the index already has a size
+ * big enough, nothing to do
+ */
+ iput(inode);
+ return 0;
+ }
+
+ dbg_rcvry("ino %lu size %lld -> %lld",
+ (unsigned long)e->inum,
+ inode->i_size, e->d_size);
+
+ ui = ubifs_inode(inode);
+
+ inode->i_size = e->d_size;
+ ui->ui_size = e->d_size;
+ ui->synced_i_size = e->d_size;
+
+ e->inode = inode;
+ }
+
+ /*
+ * In readonly mode just keep the inode pinned in memory until we go
+ * readwrite. In readwrite mode write the inode to the journal with the
+ * fixed size.
+ */
+ if (c->ro_mount)
+ return 0;
+
+ err = ubifs_jnl_write_inode(c, inode);
+
+ iput(inode);
+
+ if (err)
+ return err;
+
+ rb_erase(&e->rb, &c->size_tree);
+ kfree(e);
+
+ return 0;
+}
+
+/**
* ubifs_recover_size - recover inode size.
* @c: UBIFS file-system description object
+ * @in_place: If true, do an in-place size fixup
*
* This function attempts to fix inode size discrepancies identified by the
* 'ubifs_recover_size_accum()' function.
*
* This function returns %0 on success and a negative error code on failure.
*/
-int ubifs_recover_size(struct ubifs_info *c)
+int ubifs_recover_size(struct ubifs_info *c, bool in_place)
{
struct rb_node *this = rb_first(&c->size_tree);
@@ -1479,6 +1546,9 @@ int ubifs_recover_size(struct ubifs_info *c)
int err;
e = rb_entry(this, struct size_entry, rb);
+
+ this = rb_next(this);
+
if (!e->exists) {
union ubifs_key key;
@@ -1502,40 +1572,26 @@ int ubifs_recover_size(struct ubifs_info *c)
}
if (e->exists && e->i_size < e->d_size) {
- if (c->ro_mount) {
- /* Fix the inode size and pin it in memory */
- struct inode *inode;
- struct ubifs_inode *ui;
-
- ubifs_assert(c, !e->inode);
-
- inode = ubifs_iget(c->vfs_sb, e->inum);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- ui = ubifs_inode(inode);
- if (inode->i_size < e->d_size) {
- dbg_rcvry("ino %lu size %lld -> %lld",
- (unsigned long)e->inum,
- inode->i_size, e->d_size);
- inode->i_size = e->d_size;
- ui->ui_size = e->d_size;
- ui->synced_i_size = e->d_size;
- e->inode = inode;
- this = rb_next(this);
- continue;
- }
- iput(inode);
- } else {
- /* Fix the size in place */
+ ubifs_assert(c, !(c->ro_mount && in_place));
+
+ /*
+ * We found data that is outside the found inode size,
+ * fixup the inode size
+ */
+
+ if (in_place) {
err = fix_size_in_place(c, e);
if (err)
return err;
iput(e->inode);
+ } else {
+ err = inode_fix_size(c, e);
+ if (err)
+ return err;
+ continue;
}
}
- this = rb_next(this);
rb_erase(&e->rb, &c->size_tree);
kfree(e);
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 4844538eb926..0a0e65c07c6d 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -34,6 +34,8 @@
#include "ubifs.h"
#include <linux/list_sort.h>
+#include <crypto/hash.h>
+#include <crypto/algapi.h>
/**
* struct replay_entry - replay list entry.
@@ -56,6 +58,7 @@ struct replay_entry {
int lnum;
int offs;
int len;
+ u8 hash[UBIFS_HASH_ARR_SZ];
unsigned int deletion:1;
unsigned long long sqnum;
struct list_head list;
@@ -210,6 +213,38 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
}
/**
+ * inode_still_linked - check whether inode in question will be re-linked.
+ * @c: UBIFS file-system description object
+ * @rino: replay entry to test
+ *
+ * O_TMPFILE files can be re-linked; this means the link count goes from 0 to
+ * 1. This case needs special care, otherwise all references to the inode
+ * would be removed as soon as the first replay entry of an inode with link
+ * count 0 is found.
+ */
+static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino)
+{
+ struct replay_entry *r;
+
+ ubifs_assert(c, rino->deletion);
+ ubifs_assert(c, key_type(c, &rino->key) == UBIFS_INO_KEY);
+
+ /*
+ * Find the most recent entry for the inode behind @rino and check
+ * whether it is a deletion.
+ */
+ list_for_each_entry_reverse(r, &c->replay_list, list) {
+ ubifs_assert(c, r->sqnum >= rino->sqnum);
+ if (key_inum(c, &r->key) == key_inum(c, &rino->key))
+ return r->deletion == 0;
+ }
+
+ ubifs_assert(c, 0);
+ return false;
+}
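To make the guarded case concrete: an O_TMPFILE inode that is later linked in
produces, in sequence-number order, an inode node with nlink=0, then a
directory entry node, then an inode node with nlink=1. When the oldest entry
(nlink=0, a deletion) is applied, the helper above looks at the newest entry
for that inode number, sees that it is not a deletion, and so the inode's data
is preserved.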
+
+/**
* apply_replay_entry - apply a replay entry to the TNC.
* @c: UBIFS file-system description object
* @r: replay entry to apply
@@ -228,7 +263,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
else
err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
- r->len, &r->nm);
+ r->len, r->hash, &r->nm);
} else {
if (r->deletion)
switch (key_type(c, &r->key)) {
@@ -236,6 +271,11 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
ino_t inum = key_inum(c, &r->key);
+ if (inode_still_linked(c, r)) {
+ err = 0;
+ break;
+ }
+
err = ubifs_tnc_remove_ino(c, inum);
break;
}
@@ -248,7 +288,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
}
else
err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
- r->len);
+ r->len, r->hash);
if (err)
return err;
@@ -352,9 +392,9 @@ static void destroy_replay_list(struct ubifs_info *c)
* in case of success and a negative error code in case of failure.
*/
static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
- union ubifs_key *key, unsigned long long sqnum,
- int deletion, int *used, loff_t old_size,
- loff_t new_size)
+ const u8 *hash, union ubifs_key *key,
+ unsigned long long sqnum, int deletion, int *used,
+ loff_t old_size, loff_t new_size)
{
struct replay_entry *r;
@@ -372,6 +412,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
r->lnum = lnum;
r->offs = offs;
r->len = len;
+ ubifs_copy_hash(c, hash, r->hash);
r->deletion = !!deletion;
r->sqnum = sqnum;
key_copy(c, key, &r->key);
@@ -400,8 +441,9 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
* negative error code in case of failure.
*/
static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
- union ubifs_key *key, const char *name, int nlen,
- unsigned long long sqnum, int deletion, int *used)
+ const u8 *hash, union ubifs_key *key,
+ const char *name, int nlen, unsigned long long sqnum,
+ int deletion, int *used)
{
struct replay_entry *r;
char *nbuf;
@@ -425,6 +467,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
r->lnum = lnum;
r->offs = offs;
r->len = len;
+ ubifs_copy_hash(c, hash, r->hash);
r->deletion = !!deletion;
r->sqnum = sqnum;
key_copy(c, key, &r->key);
@@ -527,6 +570,118 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
return data == 0xFFFFFFFF;
}
+/*
+ * authenticate_sleb_hash and authenticate_sleb_hmac are split out of
+ * authenticate_sleb so that each SHASH_DESC_ON_STACK() descriptor gets its
+ * own stack frame, which keeps the peak stack usage bounded.
+ */
+static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash)
+{
+ SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
+
+ hash_desc->tfm = c->hash_tfm;
+ hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ubifs_shash_copy_state(c, log_hash, hash_desc);
+ return crypto_shash_final(hash_desc, hash);
+}
+
+static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac)
+{
+ SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm);
+
+ hmac_desc->tfm = c->hmac_tfm;
+ hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac);
+}
+
+/**
+ * authenticate_sleb - authenticate one scan LEB
+ * @c: UBIFS file-system description object
+ * @sleb: the scan LEB to authenticate
+ * @log_hash: the log hash from the commit start node up to this LEB
+ * @is_last: if true, this is the last LEB
+ *
+ * This function iterates over the buds of a single LEB authenticating all buds
+ * with the authentication nodes on this LEB. Authentication nodes are written
+ * after some buds and contain a HMAC covering the authentication node itself
+ * and the buds between the last authentication node and the current
+ * authentication node. It can happen that the last buds cannot be authenticated
+ * because a power cut happened when some nodes were written but not the
+ * corresponding authentication node. This function returns the number of nodes
+ * that could be authenticated or a negative error code.
+ */
+static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ struct shash_desc *log_hash, int is_last)
+{
+ int n_not_auth = 0;
+ struct ubifs_scan_node *snod;
+ int n_nodes = 0;
+ int err;
+ u8 *hash, *hmac;
+
+ if (!ubifs_authenticated(c))
+ return sleb->nodes_cnt;
+
+ hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
+ hmac = kmalloc(c->hmac_desc_len, GFP_NOFS);
+ if (!hash || !hmac) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+
+ n_nodes++;
+
+ if (snod->type == UBIFS_AUTH_NODE) {
+ struct ubifs_auth_node *auth = snod->node;
+
+ err = authenticate_sleb_hash(c, log_hash, hash);
+ if (err)
+ goto out;
+
+ err = authenticate_sleb_hmac(c, hash, hmac);
+ if (err)
+ goto out;
+
+ err = ubifs_check_hmac(c, auth->hmac, hmac);
+ if (err) {
+ err = -EPERM;
+ goto out;
+ }
+ n_not_auth = 0;
+ } else {
+ err = crypto_shash_update(log_hash, snod->node,
+ snod->len);
+ if (err)
+ goto out;
+ n_not_auth++;
+ }
+ }
+
+ /*
+ * A power cut can happen when some nodes were written, but not yet
+ * the corresponding authentication node. This may only happen on
+ * the last bud though.
+ */
+ if (n_not_auth) {
+ if (is_last) {
+ dbg_mnt("%d unauthenticated nodes found on LEB %d, Ignoring them",
+ n_not_auth, sleb->lnum);
+ err = 0;
+ } else {
+ dbg_mnt("%d unauthenticated nodes found on non-last LEB %d",
+ n_not_auth, sleb->lnum);
+ err = -EPERM;
+ }
+ } else {
+ err = 0;
+ }
+out:
+ kfree(hash);
+ kfree(hmac);
+
+ return err ? err : n_nodes - n_not_auth;
+}
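The checkpoint scheme that authenticate_sleb() implements can be illustrated
standalone. The sketch below is editorial, not part of the patch; it assumes
@tfm is an allocated "sha256" shash and @hmac_tfm an allocated, keyed
"hmac(sha256)" shash, and it mirrors what authenticate_sleb_hash() and
authenticate_sleb_hmac() do at each authentication node:

#include <crypto/hash.h>	/* both already included by replay.c above */
#include <crypto/algapi.h>

static int verify_checkpoint(struct crypto_shash *tfm,
			     struct crypto_shash *hmac_tfm,
			     const void * const *bufs, const int *lens,
			     int n, const u8 *expected_hmac)
{
	SHASH_DESC_ON_STACK(desc, tfm);
	SHASH_DESC_ON_STACK(hmac_desc, hmac_tfm);
	u8 hash[64], hmac[64];	/* big enough for sha512 */
	int i, err;

	/* Running hash over all nodes seen since the commit start. */
	desc->tfm = tfm;
	err = crypto_shash_init(desc);
	if (err)
		return err;
	for (i = 0; i < n; i++) {
		err = crypto_shash_update(desc, bufs[i], lens[i]);
		if (err)
			return err;
	}
	err = crypto_shash_final(desc, hash);
	if (err)
		return err;

	/* The authentication node stores an HMAC over that hash. */
	hmac_desc->tfm = hmac_tfm;
	err = crypto_shash_digest(hmac_desc, hash,
				  crypto_shash_digestsize(tfm), hmac);
	if (err)
		return err;

	return crypto_memneq(expected_hmac, hmac,
			     crypto_shash_digestsize(hmac_tfm)) ? -EPERM : 0;
}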
+
/**
* replay_bud - replay a bud logical eraseblock.
* @c: UBIFS file-system description object
@@ -540,6 +695,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
{
int is_last = is_last_bud(c, b->bud);
int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
+ int n_nodes, n = 0;
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
@@ -559,6 +715,15 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
if (IS_ERR(sleb))
return PTR_ERR(sleb);
+ n_nodes = authenticate_sleb(c, sleb, b->bud->log_hash, is_last);
+ if (n_nodes < 0) {
+ err = n_nodes;
+ goto out;
+ }
+
+ ubifs_shash_copy_state(c, b->bud->log_hash,
+ c->jheads[b->bud->jhead].log_hash);
+
/*
* The bud does not have to start from offset zero - the beginning of
* the 'lnum' LEB may contain previously committed data. One of the
@@ -582,6 +747,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
*/
list_for_each_entry(snod, &sleb->nodes, list) {
+ u8 hash[UBIFS_HASH_ARR_SZ];
int deletion = 0;
cond_resched();
@@ -591,6 +757,8 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
goto out_dump;
}
+ ubifs_node_calc_hash(c, snod->node, hash);
+
if (snod->sqnum > c->max_sqnum)
c->max_sqnum = snod->sqnum;
@@ -602,7 +770,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
if (le32_to_cpu(ino->nlink) == 0)
deletion = 1;
- err = insert_node(c, lnum, snod->offs, snod->len,
+ err = insert_node(c, lnum, snod->offs, snod->len, hash,
&snod->key, snod->sqnum, deletion,
&used, 0, new_size);
break;
@@ -614,7 +782,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
key_block(c, &snod->key) *
UBIFS_BLOCK_SIZE;
- err = insert_node(c, lnum, snod->offs, snod->len,
+ err = insert_node(c, lnum, snod->offs, snod->len, hash,
&snod->key, snod->sqnum, deletion,
&used, 0, new_size);
break;
@@ -628,7 +796,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
if (err)
goto out_dump;
- err = insert_dent(c, lnum, snod->offs, snod->len,
+ err = insert_dent(c, lnum, snod->offs, snod->len, hash,
&snod->key, dent->name,
le16_to_cpu(dent->nlen), snod->sqnum,
!le64_to_cpu(dent->inum), &used);
@@ -654,11 +822,13 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
* functions which expect nodes to have keys.
*/
trun_key_init(c, &key, le32_to_cpu(trun->inum));
- err = insert_node(c, lnum, snod->offs, snod->len,
+ err = insert_node(c, lnum, snod->offs, snod->len, hash,
&key, snod->sqnum, 1, &used,
old_size, new_size);
break;
}
+ case UBIFS_AUTH_NODE:
+ break;
default:
ubifs_err(c, "unexpected node type %d in bud LEB %d:%d",
snod->type, lnum, snod->offs);
@@ -667,6 +837,10 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
}
if (err)
goto out;
+
+ n++;
+ if (n == n_nodes)
+ break;
}
ubifs_assert(c, ubifs_search_bud(c, lnum));
@@ -745,6 +919,7 @@ static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
{
struct ubifs_bud *bud;
struct bud_entry *b;
+ int err;
dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
@@ -754,13 +929,21 @@ static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
if (!b) {
- kfree(bud);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out;
}
bud->lnum = lnum;
bud->start = offs;
bud->jhead = jhead;
+ bud->log_hash = ubifs_hash_get_desc(c);
+ if (IS_ERR(bud->log_hash)) {
+ err = PTR_ERR(bud->log_hash);
+ goto out;
+ }
+
+ ubifs_shash_copy_state(c, c->log_hash, bud->log_hash);
+
ubifs_add_bud(c, bud);
b->bud = bud;
@@ -768,6 +951,11 @@ static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
list_add_tail(&b->list, &c->replay_buds);
return 0;
+out:
+ kfree(bud);
+ kfree(b);
+
+ return err;
}
/**
@@ -873,6 +1061,14 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
+
+ err = ubifs_shash_init(c, c->log_hash);
+ if (err)
+ goto out;
+
+ err = ubifs_shash_update(c, c->log_hash, node, UBIFS_CS_NODE_SZ);
+ if (err < 0)
+ goto out;
}
if (snod->sqnum < c->cs_sqnum) {
@@ -920,6 +1116,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
if (err)
goto out_dump;
+ err = ubifs_shash_update(c, c->log_hash, ref,
+ UBIFS_REF_NODE_SZ);
+ if (err)
+ goto out;
+
err = add_replay_bud(c, le32_to_cpu(ref->lnum),
le32_to_cpu(ref->offs),
le32_to_cpu(ref->jhead),
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf17f58908ff..3da90c951c23 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -63,6 +63,17 @@
/* Default time granularity in nanoseconds */
#define DEFAULT_TIME_GRAN 1000000000
+static int get_default_compressor(struct ubifs_info *c)
+{
+ if (ubifs_compr_present(c, UBIFS_COMPR_LZO))
+ return UBIFS_COMPR_LZO;
+
+ if (ubifs_compr_present(c, UBIFS_COMPR_ZLIB))
+ return UBIFS_COMPR_ZLIB;
+
+ return UBIFS_COMPR_NONE;
+}
+
/**
* create_default_filesystem - format empty UBI volume.
* @c: UBIFS file-system description object
@@ -82,10 +93,13 @@ static int create_default_filesystem(struct ubifs_info *c)
int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
int min_leb_cnt = UBIFS_MIN_LEB_CNT;
+ int idx_node_size;
long long tmp64, main_bytes;
__le64 tmp_le64;
__le32 tmp_le32;
struct timespec64 ts;
+ u8 hash[UBIFS_HASH_ARR_SZ];
+ u8 hash_lpt[UBIFS_HASH_ARR_SZ];
 /* Some functions called from here depend on the @c->key_len field */
c->key_len = UBIFS_SK_LEN;
@@ -147,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
c->lsave_cnt = DEFAULT_LSAVE_CNT;
c->max_leb_cnt = c->leb_cnt;
err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
- &big_lpt);
+ &big_lpt, hash_lpt);
if (err)
return err;
@@ -156,17 +170,35 @@ static int create_default_filesystem(struct ubifs_info *c)
main_first = c->leb_cnt - main_lebs;
+ sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL);
+ mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+ idx_node_size = ubifs_idx_node_sz(c, 1);
+ idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL);
+ ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL);
+ cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL);
+
+ if (!sup || !mst || !idx || !ino || !cs) {
+ err = -ENOMEM;
+ goto out;
+ }
+
/* Create default superblock */
- tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
- sup = kzalloc(tmp, GFP_KERNEL);
- if (!sup)
- return -ENOMEM;
tmp64 = (long long)max_buds * c->leb_size;
if (big_lpt)
sup_flags |= UBIFS_FLG_BIGLPT;
sup_flags |= UBIFS_FLG_DOUBLE_HASH;
+ if (ubifs_authenticated(c)) {
+ sup_flags |= UBIFS_FLG_AUTHENTICATION;
+ sup->hash_algo = cpu_to_le16(c->auth_hash_algo);
+ err = ubifs_hmac_wkm(c, sup->hmac_wkm);
+ if (err)
+ goto out;
+ } else {
+ sup->hash_algo = cpu_to_le16(0xffff);
+ }
+
sup->ch.node_type = UBIFS_SB_NODE;
sup->key_hash = UBIFS_KEY_HASH_R5;
sup->flags = cpu_to_le32(sup_flags);
@@ -186,7 +218,7 @@ static int create_default_filesystem(struct ubifs_info *c)
if (c->mount_opts.override_compr)
sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
else
- sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
+ sup->default_compr = cpu_to_le16(get_default_compressor(c));
generate_random_uuid(sup->uuid);
@@ -197,17 +229,9 @@ static int create_default_filesystem(struct ubifs_info *c)
sup->rp_size = cpu_to_le64(tmp64);
sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
- err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0);
- kfree(sup);
- if (err)
- return err;
-
dbg_gen("default superblock created at LEB 0:0");
/* Create default master node */
- mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
- if (!mst)
- return -ENOMEM;
mst->ch.node_type = UBIFS_MST_NODE;
mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM);
@@ -233,6 +257,7 @@ static int create_default_filesystem(struct ubifs_info *c)
mst->empty_lebs = cpu_to_le32(main_lebs - 2);
mst->idx_lebs = cpu_to_le32(1);
mst->leb_cnt = cpu_to_le32(c->leb_cnt);
+ ubifs_copy_hash(c, hash_lpt, mst->hash_lpt);
/* Calculate lprops statistics */
tmp64 = main_bytes;
@@ -253,24 +278,9 @@ static int create_default_filesystem(struct ubifs_info *c)
mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
- err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0);
- if (err) {
- kfree(mst);
- return err;
- }
- err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
- 0);
- kfree(mst);
- if (err)
- return err;
-
dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
/* Create the root indexing node */
- tmp = ubifs_idx_node_sz(c, 1);
- idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
- if (!idx)
- return -ENOMEM;
c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
c->key_hash = key_r5_hash;
@@ -282,19 +292,11 @@ static int create_default_filesystem(struct ubifs_info *c)
key_write_idx(c, &key, &br->key);
br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);
- err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0);
- kfree(idx);
- if (err)
- return err;
dbg_gen("default root indexing node created LEB %d:0",
main_first + DEFAULT_IDX_LEB);
/* Create default root inode */
- tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
- ino = kzalloc(tmp, GFP_KERNEL);
- if (!ino)
- return -ENOMEM;
ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
ino->ch.node_type = UBIFS_INO_NODE;
@@ -317,12 +319,6 @@ static int create_default_filesystem(struct ubifs_info *c)
/* Set compression enabled by default */
ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
- err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
- main_first + DEFAULT_DATA_LEB, 0);
- kfree(ino);
- if (err)
- return err;
-
dbg_gen("root inode created at LEB %d:0",
main_first + DEFAULT_DATA_LEB);
@@ -331,19 +327,54 @@ static int create_default_filesystem(struct ubifs_info *c)
* always the case during normal file-system operation. Write a fake
* commit start node to the log.
*/
- tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
- cs = kzalloc(tmp, GFP_KERNEL);
- if (!cs)
- return -ENOMEM;
cs->ch.node_type = UBIFS_CS_NODE;
+
+ err = ubifs_write_node_hmac(c, sup, UBIFS_SB_NODE_SZ, 0, 0,
+ offsetof(struct ubifs_sb_node, hmac));
+ if (err)
+ goto out;
+
+ err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
+ main_first + DEFAULT_DATA_LEB, 0);
+ if (err)
+ goto out;
+
+ ubifs_node_calc_hash(c, ino, hash);
+ ubifs_copy_hash(c, hash, ubifs_branch_hash(c, br));
+
+ err = ubifs_write_node(c, idx, idx_node_size, main_first + DEFAULT_IDX_LEB, 0);
+ if (err)
+ goto out;
+
+ ubifs_node_calc_hash(c, idx, hash);
+ ubifs_copy_hash(c, hash, mst->hash_root_idx);
+
+ err = ubifs_write_node_hmac(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
+ offsetof(struct ubifs_mst_node, hmac));
+ if (err)
+ goto out;
+
+ err = ubifs_write_node_hmac(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
+ 0, offsetof(struct ubifs_mst_node, hmac));
+ if (err)
+ goto out;
+
err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
- kfree(cs);
if (err)
- return err;
+ goto out;
ubifs_msg(c, "default file-system created");
- return 0;
+
+ err = 0;
+out:
+ kfree(sup);
+ kfree(mst);
+ kfree(idx);
+ kfree(ino);
+ kfree(cs);
+
+ return err;
}
/**
@@ -498,7 +529,7 @@ failed:
* code. Note, the user of this function is responsible of kfree()'ing the
* returned superblock buffer.
*/
-struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
+static struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
{
struct ubifs_sb_node *sup;
int err;
@@ -517,6 +548,65 @@ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
return sup;
}
+static int authenticate_sb_node(struct ubifs_info *c,
+ const struct ubifs_sb_node *sup)
+{
+ unsigned int sup_flags = le32_to_cpu(sup->flags);
+ u8 hmac_wkm[UBIFS_HMAC_ARR_SZ];
+ int authenticated = !!(sup_flags & UBIFS_FLG_AUTHENTICATION);
+ int hash_algo;
+ int err;
+
+ if (c->authenticated && !authenticated) {
+ ubifs_err(c, "authenticated FS forced, but found FS without authentication");
+ return -EINVAL;
+ }
+
+ if (!c->authenticated && authenticated) {
+ ubifs_err(c, "authenticated FS found, but no key given");
+ return -EINVAL;
+ }
+
+ ubifs_msg(c, "Mounting in %sauthenticated mode",
+ c->authenticated ? "" : "un");
+
+ if (!c->authenticated)
+ return 0;
+
+ if (!IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION))
+ return -EOPNOTSUPP;
+
+ hash_algo = le16_to_cpu(sup->hash_algo);
+ if (hash_algo >= HASH_ALGO__LAST) {
+ ubifs_err(c, "superblock uses unknown hash algo %d",
+ hash_algo);
+ return -EINVAL;
+ }
+
+ if (strcmp(hash_algo_name[hash_algo], c->auth_hash_name)) {
+ ubifs_err(c, "This filesystem uses %s for hashing,"
+ " but %s is specified", hash_algo_name[hash_algo],
+ c->auth_hash_name);
+ return -EINVAL;
+ }
+
+ err = ubifs_hmac_wkm(c, hmac_wkm);
+ if (err)
+ return err;
+
+ if (ubifs_check_hmac(c, hmac_wkm, sup->hmac_wkm)) {
+ ubifs_err(c, "provided key does not fit");
+ return -ENOKEY;
+ }
+
+ err = ubifs_node_verify_hmac(c, sup, sizeof(*sup),
+ offsetof(struct ubifs_sb_node, hmac));
+ if (err)
+ ubifs_err(c, "Failed to authenticate superblock: %d", err);
+
+ return err;
+}
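The hmac_wkm check above lets a mount with the wrong key fail early with
-ENOKEY instead of failing later on node verification. ubifs_hmac_wkm() itself
is not shown in this patch; per the @hmac_wkm documentation added to
ubifs-media.h below it is assumed to HMAC the fixed string "UBIFS", so a
conceptual equivalent of the check would be:

	SHASH_DESC_ON_STACK(d, c->hmac_tfm);
	u8 calc[UBIFS_MAX_HMAC_LEN];

	d->tfm = c->hmac_tfm;
	err = crypto_shash_digest(d, "UBIFS", 5, calc);
	if (!err && ubifs_check_hmac(c, calc, sup->hmac_wkm))
		err = -ENOKEY;	/* provided key does not fit */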
+
/**
* ubifs_write_sb_node - write superblock node.
* @c: UBIFS file-system description object
@@ -527,8 +617,13 @@ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
{
int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+ int err;
+
+ err = ubifs_prepare_node_hmac(c, sup, UBIFS_SB_NODE_SZ,
+ offsetof(struct ubifs_sb_node, hmac), 1);
+ if (err)
+ return err;
- ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len);
}
@@ -555,6 +650,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
if (IS_ERR(sup))
return PTR_ERR(sup);
+ c->sup_node = sup;
+
c->fmt_version = le32_to_cpu(sup->fmt_version);
c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
@@ -603,7 +700,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->key_hash = key_test_hash;
c->key_hash_type = UBIFS_KEY_HASH_TEST;
break;
- };
+ }
c->key_fmt = sup->key_fmt;
@@ -640,6 +737,10 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH);
c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION);
+ err = authenticate_sb_node(c, sup);
+ if (err)
+ goto out;
+
if ((sup_flags & ~UBIFS_FLG_MASK) != 0) {
ubifs_err(c, "Unknown feature flags found: %#x",
sup_flags & ~UBIFS_FLG_MASK);
@@ -686,7 +787,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
err = validate_sb(c, sup);
out:
- kfree(sup);
return err;
}
@@ -815,7 +915,7 @@ out:
int ubifs_fixup_free_space(struct ubifs_info *c)
{
int err;
- struct ubifs_sb_node *sup;
+ struct ubifs_sb_node *sup = c->sup_node;
ubifs_assert(c, c->space_fixup);
ubifs_assert(c, !c->ro_mount);
@@ -826,16 +926,11 @@ int ubifs_fixup_free_space(struct ubifs_info *c)
if (err)
return err;
- sup = ubifs_read_sb_node(c);
- if (IS_ERR(sup))
- return PTR_ERR(sup);
-
/* Free-space fixup is no longer required */
c->space_fixup = 0;
sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
err = ubifs_write_sb_node(c, sup);
- kfree(sup);
if (err)
return err;
@@ -846,7 +941,7 @@ int ubifs_fixup_free_space(struct ubifs_info *c)
int ubifs_enable_encryption(struct ubifs_info *c)
{
int err;
- struct ubifs_sb_node *sup;
+ struct ubifs_sb_node *sup = c->sup_node;
if (c->encrypted)
return 0;
@@ -859,16 +954,11 @@ int ubifs_enable_encryption(struct ubifs_info *c)
return -EINVAL;
}
- sup = ubifs_read_sb_node(c);
- if (IS_ERR(sup))
- return PTR_ERR(sup);
-
sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION);
err = ubifs_write_sb_node(c, sup);
if (!err)
c->encrypted = 1;
- kfree(sup);
return err;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index fec62e9dfbe6..1fac1133dadd 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -579,6 +579,9 @@ static int init_constants_early(struct ubifs_info *c)
c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
+ c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ;
+ c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ +
+ UBIFS_MAX_HMAC_LEN;
c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
@@ -816,6 +819,9 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
c->jheads[i].wbuf.jhead = i;
c->jheads[i].grouped = 1;
+ c->jheads[i].log_hash = ubifs_hash_get_desc(c);
+ if (IS_ERR(c->jheads[i].log_hash)) {
+ err = PTR_ERR(c->jheads[i].log_hash);
+ goto out;
+ }
}
/*
@@ -826,6 +832,12 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[GCHD].grouped = 0;
return 0;
+
+out:
+ while (i--)
+ kfree(c->jheads[i].log_hash);
+
+ return err;
}
/**
@@ -840,6 +852,7 @@ static void free_wbufs(struct ubifs_info *c)
for (i = 0; i < c->jhead_cnt; i++) {
kfree(c->jheads[i].wbuf.buf);
kfree(c->jheads[i].wbuf.inodes);
+ kfree(c->jheads[i].log_hash);
}
kfree(c->jheads);
c->jheads = NULL;
@@ -924,6 +937,8 @@ static int check_volume_empty(struct ubifs_info *c)
* Opt_no_chk_data_crc: do not check CRCs when reading data nodes
* Opt_override_compr: override default compressor
* Opt_assert: set ubifs_assert() action
+ * Opt_auth_key: The key name used for authentication
+ * Opt_auth_hash_name: The hash type used for authentication
* Opt_err: just end of array marker
*/
enum {
@@ -935,6 +950,8 @@ enum {
Opt_no_chk_data_crc,
Opt_override_compr,
Opt_assert,
+ Opt_auth_key,
+ Opt_auth_hash_name,
Opt_ignore,
Opt_err,
};
@@ -947,6 +964,8 @@ static const match_table_t tokens = {
{Opt_chk_data_crc, "chk_data_crc"},
{Opt_no_chk_data_crc, "no_chk_data_crc"},
{Opt_override_compr, "compr=%s"},
+ {Opt_auth_key, "auth_key=%s"},
+ {Opt_auth_hash_name, "auth_hash_name=%s"},
{Opt_ignore, "ubi=%s"},
{Opt_ignore, "vol=%s"},
{Opt_assert, "assert=%s"},
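Together the two new options enable an authenticated mount. A hedged usage
sketch (key name and hash algorithm are examples only; the authentication key
is expected to already be present in the kernel keyring): first
"keyctl add logon ubifs:mykey <key-payload> @s", then
"mount -t ubifs -o auth_key=ubifs:mykey,auth_hash_name=sha256 ubi0:rootfs /mnt".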
@@ -1070,6 +1089,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
kfree(act);
break;
}
+ case Opt_auth_key:
+ c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL);
+ if (!c->auth_key_name)
+ return -ENOMEM;
+ break;
+ case Opt_auth_hash_name:
+ c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL);
+ if (!c->auth_hash_name)
+ return -ENOMEM;
+ break;
case Opt_ignore:
break;
default:
@@ -1249,6 +1278,19 @@ static int mount_ubifs(struct ubifs_info *c)
c->mounting = 1;
+ if (c->auth_key_name) {
+ if (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) {
+ err = ubifs_init_authentication(c);
+ if (err)
+ goto out_free;
+ } else {
+ ubifs_err(c, "auth_key_name, but UBIFS is built without"
+ " authentication support");
+ err = -EINVAL;
+ goto out_free;
+ }
+ }
+
err = ubifs_read_superblock(c);
if (err)
goto out_free;
@@ -1367,12 +1409,21 @@ static int mount_ubifs(struct ubifs_info *c)
}
if (c->need_recovery) {
- err = ubifs_recover_size(c);
- if (err)
- goto out_orphans;
+ if (!ubifs_authenticated(c)) {
+ err = ubifs_recover_size(c, true);
+ if (err)
+ goto out_orphans;
+ }
+
err = ubifs_rcvry_gc_commit(c);
if (err)
goto out_orphans;
+
+ if (ubifs_authenticated(c)) {
+ err = ubifs_recover_size(c, false);
+ if (err)
+ goto out_orphans;
+ }
} else {
err = take_gc_lnum(c);
if (err)
@@ -1391,7 +1442,7 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_orphans;
} else if (c->need_recovery) {
- err = ubifs_recover_size(c);
+ err = ubifs_recover_size(c, false);
if (err)
goto out_orphans;
} else {
@@ -1557,7 +1608,10 @@ static void ubifs_umount(struct ubifs_info *c)
free_wbufs(c);
free_orphans(c);
ubifs_lpt_free(c, 0);
+ ubifs_exit_authentication(c);
+ kfree(c->auth_key_name);
+ kfree(c->auth_hash_name);
kfree(c->cbuf);
kfree(c->rcvrd_mst_node);
kfree(c->mst_node);
@@ -1605,16 +1659,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
goto out;
if (c->old_leb_cnt != c->leb_cnt) {
- struct ubifs_sb_node *sup;
+ struct ubifs_sb_node *sup = c->sup_node;
- sup = ubifs_read_sb_node(c);
- if (IS_ERR(sup)) {
- err = PTR_ERR(sup);
- goto out;
- }
sup->leb_cnt = cpu_to_le32(c->leb_cnt);
err = ubifs_write_sb_node(c, sup);
- kfree(sup);
if (err)
goto out;
}
@@ -1624,9 +1672,11 @@ static int ubifs_remount_rw(struct ubifs_info *c)
err = ubifs_write_rcvrd_mst_node(c);
if (err)
goto out;
- err = ubifs_recover_size(c);
- if (err)
- goto out;
+ if (!ubifs_authenticated(c)) {
+ err = ubifs_recover_size(c, true);
+ if (err)
+ goto out;
+ }
err = ubifs_clean_lebs(c, c->sbuf);
if (err)
goto out;
@@ -1692,10 +1742,19 @@ static int ubifs_remount_rw(struct ubifs_info *c)
goto out;
}
- if (c->need_recovery)
+ if (c->need_recovery) {
err = ubifs_rcvry_gc_commit(c);
- else
+ if (err)
+ goto out;
+
+ if (ubifs_authenticated(c)) {
+ err = ubifs_recover_size(c, false);
+ if (err)
+ goto out;
+ }
+ } else {
err = ubifs_leb_unmap(c, c->gc_lnum);
+ }
if (err)
goto out;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index bf416e512743..25572ffea163 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -35,7 +35,7 @@
#include "ubifs.h"
static int try_read_node(const struct ubifs_info *c, void *buf, int type,
- int len, int lnum, int offs);
+ struct ubifs_zbranch *zbr);
static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_zbranch *zbr, void *node);
@@ -433,9 +433,7 @@ static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
* @c: UBIFS file-system description object
* @buf: buffer to read to
* @type: node type
- * @len: node length (not aligned)
- * @lnum: LEB number of node to read
- * @offs: offset of node to read
+ * @zbr: the zbranch describing the node to read
*
* This function tries to read a node of known type and length, checks it and
* stores it in @buf. This function returns %1 if a node is present and %0 if
@@ -453,8 +451,11 @@ static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
* journal nodes may potentially be corrupted, so checking is required.
*/
static int try_read_node(const struct ubifs_info *c, void *buf, int type,
- int len, int lnum, int offs)
+ struct ubifs_zbranch *zbr)
{
+ int len = zbr->len;
+ int lnum = zbr->lnum;
+ int offs = zbr->offs;
int err, node_len;
struct ubifs_ch *ch = buf;
uint32_t crc, node_crc;
@@ -487,6 +488,12 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
if (crc != node_crc)
return 0;
+ err = ubifs_node_check_hash(c, buf, zbr->hash);
+ if (err) {
+ ubifs_bad_hash(c, buf, zbr->hash, lnum, offs);
+ return 0;
+ }
+
return 1;
}
@@ -507,8 +514,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
- ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
- zbr->offs);
+ ret = try_read_node(c, node, key_type(c, key), zbr);
if (ret == 1) {
union ubifs_key node_key;
struct ubifs_dent_node *dent = node;
@@ -1713,6 +1719,12 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
goto out;
}
+ err = ubifs_node_check_hash(c, buf, zbr->hash);
+ if (err) {
+ ubifs_bad_hash(c, buf, zbr->hash, zbr->lnum, zbr->offs);
+ return err;
+ }
+
len = le32_to_cpu(ch->len);
if (len != zbr->len) {
ubifs_err(c, "bad node length %d, expected %d", len, zbr->len);
@@ -2260,13 +2272,14 @@ do_split:
* @lnum: LEB number of node
* @offs: node offset
* @len: node length
+ * @hash: The hash over the node
*
* This function adds a node with key @key to TNC. The node may be new or it may
* obsolete some existing one. Returns %0 on success or negative error code on
* failure.
*/
int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
- int offs, int len)
+ int offs, int len, const u8 *hash)
{
int found, n, err = 0;
struct ubifs_znode *znode;
@@ -2281,6 +2294,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
zbr.lnum = lnum;
zbr.offs = offs;
zbr.len = len;
+ ubifs_copy_hash(c, hash, zbr.hash);
key_copy(c, key, &zbr.key);
err = tnc_insert(c, znode, &zbr, n + 1);
} else if (found == 1) {
@@ -2291,6 +2305,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
zbr->lnum = lnum;
zbr->offs = offs;
zbr->len = len;
+ ubifs_copy_hash(c, hash, zbr->hash);
} else
err = found;
if (!err)
@@ -2392,13 +2407,14 @@ out_unlock:
* @lnum: LEB number of node
* @offs: node offset
* @len: node length
+ * @hash: The hash over the node
* @nm: node name
*
* This is the same as 'ubifs_tnc_add()' but it should be used with keys which
* may have collisions, like directory entry keys.
*/
int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
- int lnum, int offs, int len,
+ int lnum, int offs, int len, const u8 *hash,
const struct fscrypt_name *nm)
{
int found, n, err = 0;
@@ -2441,6 +2457,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
zbr->lnum = lnum;
zbr->offs = offs;
zbr->len = len;
+ ubifs_copy_hash(c, hash, zbr->hash);
goto out_unlock;
}
}
@@ -2452,6 +2469,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
zbr.lnum = lnum;
zbr.offs = offs;
zbr.len = len;
+ ubifs_copy_hash(c, hash, zbr.hash);
key_copy(c, key, &zbr.key);
err = tnc_insert(c, znode, &zbr, n + 1);
if (err)
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index dba87d09b989..dbcd2c350b65 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -38,6 +38,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
struct ubifs_znode *znode, int lnum, int offs, int len)
{
struct ubifs_znode *zp;
+ u8 hash[UBIFS_HASH_ARR_SZ];
int i, err;
/* Make index node */
@@ -52,6 +53,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
br->lnum = cpu_to_le32(zbr->lnum);
br->offs = cpu_to_le32(zbr->offs);
br->len = cpu_to_le32(zbr->len);
+ ubifs_copy_hash(c, zbr->hash, ubifs_branch_hash(c, br));
if (!zbr->lnum || !zbr->len) {
ubifs_err(c, "bad ref in znode");
ubifs_dump_znode(c, znode);
@@ -62,6 +64,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
}
}
ubifs_prepare_node(c, idx, len, 0);
+ ubifs_node_calc_hash(c, idx, hash);
znode->lnum = lnum;
znode->offs = offs;
@@ -78,10 +81,12 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
zbr->lnum = lnum;
zbr->offs = offs;
zbr->len = len;
+ ubifs_copy_hash(c, hash, zbr->hash);
} else {
c->zroot.lnum = lnum;
c->zroot.offs = offs;
c->zroot.len = len;
+ ubifs_copy_hash(c, hash, c->zroot.hash);
}
c->calc_idx_sz += ALIGN(len, 8);
@@ -647,6 +652,8 @@ static int get_znodes_to_commit(struct ubifs_info *c)
znode->cnext = c->cnext;
break;
}
+ znode->cparent = znode->parent;
+ znode->ciip = znode->iip;
znode->cnext = cnext;
znode = cnext;
cnt += 1;
@@ -840,6 +847,8 @@ static int write_index(struct ubifs_info *c)
}
while (1) {
+ u8 hash[UBIFS_HASH_ARR_SZ];
+
cond_resched();
znode = cnext;
@@ -857,6 +866,7 @@ static int write_index(struct ubifs_info *c)
br->lnum = cpu_to_le32(zbr->lnum);
br->offs = cpu_to_le32(zbr->offs);
br->len = cpu_to_le32(zbr->len);
+ ubifs_copy_hash(c, zbr->hash, ubifs_branch_hash(c, br));
if (!zbr->lnum || !zbr->len) {
ubifs_err(c, "bad ref in znode");
ubifs_dump_znode(c, znode);
@@ -868,6 +878,23 @@ static int write_index(struct ubifs_info *c)
}
len = ubifs_idx_node_sz(c, znode->child_cnt);
ubifs_prepare_node(c, idx, len, 0);
+ ubifs_node_calc_hash(c, idx, hash);
+
+ mutex_lock(&c->tnc_mutex);
+
+ if (znode->cparent)
+ ubifs_copy_hash(c, hash,
+ znode->cparent->zbranch[znode->ciip].hash);
+
+ if (znode->parent) {
+ if (!ubifs_zn_obsolete(znode))
+ ubifs_copy_hash(c, hash,
+ znode->parent->zbranch[znode->iip].hash);
+ } else {
+ ubifs_copy_hash(c, hash, c->zroot.hash);
+ }
+
+ mutex_unlock(&c->tnc_mutex);
/* Determine the index node position */
if (lnum == -1) {
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index d90ee01076a9..d1815e959007 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -265,9 +265,7 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
/**
* read_znode - read an indexing node from flash and fill znode.
* @c: UBIFS file-system description object
- * @lnum: LEB of the indexing node to read
- * @offs: node offset
- * @len: node length
+ * @zzbr: the zbranch describing the node to read
* @znode: znode to read to
*
* This function reads an indexing node from the flash media and fills znode
@@ -276,9 +274,12 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
* is wrong with it, this function prints complaint messages and returns
* %-EINVAL.
*/
-static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
+static int read_znode(struct ubifs_info *c, struct ubifs_zbranch *zzbr,
struct ubifs_znode *znode)
{
+ int lnum = zzbr->lnum;
+ int offs = zzbr->offs;
+ int len = zzbr->len;
int i, err, type, cmp;
struct ubifs_idx_node *idx;
@@ -292,6 +293,12 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
return err;
}
+ err = ubifs_node_check_hash(c, idx, zzbr->hash);
+ if (err) {
+ ubifs_bad_hash(c, idx, zzbr->hash, lnum, offs);
+ return err;
+ }
+
znode->child_cnt = le16_to_cpu(idx->child_cnt);
znode->level = le16_to_cpu(idx->level);
@@ -308,13 +315,14 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
}
for (i = 0; i < znode->child_cnt; i++) {
- const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+ struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
struct ubifs_zbranch *zbr = &znode->zbranch[i];
key_read(c, &br->key, &zbr->key);
zbr->lnum = le32_to_cpu(br->lnum);
zbr->offs = le32_to_cpu(br->offs);
zbr->len = le32_to_cpu(br->len);
+ ubifs_copy_hash(c, ubifs_branch_hash(c, br), zbr->hash);
zbr->znode = NULL;
/* Validate branch */
@@ -425,7 +433,7 @@ struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
if (!znode)
return ERR_PTR(-ENOMEM);
- err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
+ err = read_znode(c, zbr, znode);
if (err)
goto out;
@@ -496,5 +504,11 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
return -EINVAL;
}
+ err = ubifs_node_check_hash(c, node, zbr->hash);
+ if (err) {
+ ubifs_bad_hash(c, node, zbr->hash, zbr->lnum, zbr->offs);
+ return err;
+ }
+
return 0;
}
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index e8c23c9d4f4a..8b7c1844014f 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -286,6 +286,7 @@ enum {
#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node)
#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node)
#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node)
+#define UBIFS_AUTH_NODE_SZ sizeof(struct ubifs_auth_node)
/* Extended attribute entry nodes are identical to directory entry nodes */
#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ
/* Only this does not have to be multiple of 8 bytes */
@@ -300,6 +301,12 @@ enum {
/* The largest UBIFS node */
#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
+/* The maximum size of a hash, enough for sha512 */
+#define UBIFS_MAX_HASH_LEN 64
+
+/* The maximum size of an HMAC, enough for hmac(sha512) */
+#define UBIFS_MAX_HMAC_LEN 64
+
/*
* xattr name of UBIFS encryption context, we don't use a prefix
* nor a long name to not waste space on the flash.
@@ -365,6 +372,7 @@ enum {
* UBIFS_IDX_NODE: index node
* UBIFS_CS_NODE: commit start node
* UBIFS_ORPH_NODE: orphan node
+ * UBIFS_AUTH_NODE: authentication node
* UBIFS_NODE_TYPES_CNT: count of supported node types
*
* Note, we index arrays by these numbers, so keep them low and contiguous.
@@ -384,6 +392,7 @@ enum {
UBIFS_IDX_NODE,
UBIFS_CS_NODE,
UBIFS_ORPH_NODE,
+ UBIFS_AUTH_NODE,
UBIFS_NODE_TYPES_CNT,
};
@@ -421,15 +430,19 @@ enum {
* UBIFS_FLG_DOUBLE_HASH: store a 32bit cookie in directory entry nodes to
* support 64bit cookies for lookups by hash
* UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files
+ * UBIFS_FLG_AUTHENTICATION: this filesystem contains hashes for authentication
*/
enum {
UBIFS_FLG_BIGLPT = 0x02,
UBIFS_FLG_SPACE_FIXUP = 0x04,
UBIFS_FLG_DOUBLE_HASH = 0x08,
UBIFS_FLG_ENCRYPTION = 0x10,
+ UBIFS_FLG_AUTHENTICATION = 0x20,
};
-#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION)
+#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT | UBIFS_FLG_SPACE_FIXUP | \
+ UBIFS_FLG_DOUBLE_HASH | UBIFS_FLG_ENCRYPTION | \
+ UBIFS_FLG_AUTHENTICATION)
/**
* struct ubifs_ch - common header node.
@@ -633,6 +646,10 @@ struct ubifs_pad_node {
* @time_gran: time granularity in nanoseconds
* @uuid: UUID generated when the file system image was created
* @ro_compat_version: UBIFS R/O compatibility version
+ * @hmac: HMAC to authenticate the superblock node
+ * @hmac_wkm: HMAC of a well-known message (the string "UBIFS") as a convenience
+ * to the user to check if the correct key is passed.
+ * @hash_algo: The hash algo used for this filesystem (one of enum hash_algo)
*/
struct ubifs_sb_node {
struct ubifs_ch ch;
@@ -660,7 +677,10 @@ struct ubifs_sb_node {
__le32 time_gran;
__u8 uuid[16];
__le32 ro_compat_version;
- __u8 padding2[3968];
+ __u8 hmac[UBIFS_MAX_HMAC_LEN];
+ __u8 hmac_wkm[UBIFS_MAX_HMAC_LEN];
+ __le16 hash_algo;
+ __u8 padding2[3838];
} __packed;
/**
@@ -695,6 +715,9 @@ struct ubifs_sb_node {
* @empty_lebs: number of empty logical eraseblocks
* @idx_lebs: number of indexing logical eraseblocks
* @leb_cnt: count of LEBs used by file-system
+ * @hash_root_idx: the hash of the root index node
+ * @hash_lpt: the hash of the LPT
+ * @hmac: HMAC to authenticate the master node
* @padding: reserved for future, zeroes
*/
struct ubifs_mst_node {
@@ -727,7 +750,10 @@ struct ubifs_mst_node {
__le32 empty_lebs;
__le32 idx_lebs;
__le32 leb_cnt;
- __u8 padding[344];
+ __u8 hash_root_idx[UBIFS_MAX_HASH_LEN];
+ __u8 hash_lpt[UBIFS_MAX_HASH_LEN];
+ __u8 hmac[UBIFS_MAX_HMAC_LEN];
+ __u8 padding[152];
} __packed;
/**
@@ -747,11 +773,25 @@ struct ubifs_ref_node {
} __packed;
/**
+ * struct ubifs_auth_node - node for authenticating other nodes
+ * @ch: common header
+ * @hmac: The HMAC
+ */
+struct ubifs_auth_node {
+ struct ubifs_ch ch;
+ __u8 hmac[];
+} __packed;
+
+/**
* struct ubifs_branch - key/reference/length branch
* @lnum: LEB number of the target node
* @offs: offset within @lnum
* @len: target node length
* @key: key
+ *
+ * In an authenticated UBIFS we have the hash of the referenced node after @key.
+ * This can't be added to the struct type definition because @key is a
+ * dynamically sized element already.
*/
struct ubifs_branch {
__le32 lnum;
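The layout the comment above describes means the effective branch size is no
longer just sizeof(struct ubifs_branch) plus the key. An illustrative helper
(hypothetical, mirroring the arithmetic of the ubifs_branch_hash() accessor
added to ubifs.h below):

static inline int example_branch_sz(const struct ubifs_info *c)
{
	/* header | key (c->key_len) | hash (c->hash_len, authenticated only) */
	return sizeof(struct ubifs_branch) + c->key_len +
	       (ubifs_authenticated(c) ? c->hash_len : 0);
}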
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4368cde476b0..38401adaa00d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -39,6 +39,9 @@
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/random.h>
+#include <crypto/hash_info.h>
+#include <crypto/hash.h>
+#include <crypto/algapi.h>
#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
@@ -157,6 +160,14 @@
/* Maximum number of data nodes to bulk-read */
#define UBIFS_MAX_BULK_READ 32
+#ifdef CONFIG_UBIFS_FS_AUTHENTICATION
+#define UBIFS_HASH_ARR_SZ UBIFS_MAX_HASH_LEN
+#define UBIFS_HMAC_ARR_SZ UBIFS_MAX_HMAC_LEN
+#else
+#define UBIFS_HASH_ARR_SZ 0
+#define UBIFS_HMAC_ARR_SZ 0
+#endif
+
/*
* Lockdep classes for UBIFS inode @ui_mutex.
*/
@@ -706,6 +717,7 @@ struct ubifs_wbuf {
* @jhead: journal head number this bud belongs to
* @list: link in the list buds belonging to the same journal head
* @rb: link in the tree of all buds
+ * @log_hash: the log hash from the commit start node up to this bud
*/
struct ubifs_bud {
int lnum;
@@ -713,6 +725,7 @@ struct ubifs_bud {
int jhead;
struct list_head list;
struct rb_node rb;
+ struct shash_desc *log_hash;
};
/**
@@ -720,6 +733,7 @@ struct ubifs_bud {
* @wbuf: head's write-buffer
* @buds_list: list of bud LEBs belonging to this journal head
* @grouped: non-zero if UBIFS groups nodes when writing to this journal head
+ * @log_hash: the log hash from the commit start node up to this journal head
*
* Note, the @buds list is protected by the @c->buds_lock.
*/
@@ -727,6 +741,7 @@ struct ubifs_jhead {
struct ubifs_wbuf wbuf;
struct list_head buds_list;
unsigned int grouped:1;
+ struct shash_desc *log_hash;
};
/**
@@ -736,6 +751,7 @@ struct ubifs_jhead {
* @lnum: LEB number of the target node (indexing node or data node)
* @offs: target node offset within @lnum
* @len: target node length
+ * @hash: the hash of the target node
*/
struct ubifs_zbranch {
union ubifs_key key;
@@ -746,12 +762,15 @@ struct ubifs_zbranch {
int lnum;
int offs;
int len;
+ u8 hash[UBIFS_HASH_ARR_SZ];
};
/**
* struct ubifs_znode - in-memory representation of an indexing node.
* @parent: parent znode or NULL if it is the root
* @cnext: next znode to commit
+ * @cparent: parent node for this commit
+ * @ciip: index in cparent's zbranch array
* @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
* @time: last access time (seconds)
* @level: level of the entry in the TNC tree
@@ -769,6 +788,8 @@ struct ubifs_zbranch {
struct ubifs_znode {
struct ubifs_znode *parent;
struct ubifs_znode *cnext;
+ struct ubifs_znode *cparent;
+ int ciip;
unsigned long flags;
time64_t time;
int level;
@@ -983,6 +1004,7 @@ struct ubifs_debug_info;
* struct ubifs_info - UBIFS file-system description data structure
* (per-superblock).
* @vfs_sb: VFS @struct super_block object
+ * @sup_node: The super block node as read from the device
*
* @highest_inum: highest used inode number
* @max_sqnum: current global sequence number
@@ -1028,6 +1050,7 @@ struct ubifs_debug_info;
* @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
* @rw_incompat: the media is not R/W compatible
* @assert_action: action to take when a ubifs_assert() fails
+ * @authenticated: flag indicating the FS is mounted in authenticated mode
*
* @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
* @calc_idx_sz
@@ -1075,6 +1098,7 @@ struct ubifs_debug_info;
* @key_hash: direntry key hash function
* @key_fmt: key format
* @key_len: key length
+ * @hash_len: The length of the index node hashes
* @fanout: fanout of the index tree (number of links per indexing node)
*
* @min_io_size: minimal input/output unit size
@@ -1210,6 +1234,15 @@ struct ubifs_debug_info;
* @rp_uid: reserved pool user ID
* @rp_gid: reserved pool group ID
*
+ * @hash_tfm: the hash transformation used for hashing nodes
+ * @hmac_tfm: the HMAC transformation for this filesystem
+ * @hmac_desc_len: length of the HMAC used for authentication
+ * @auth_key_name: the authentication key name
+ * @auth_hash_name: the name of the hash algorithm used for authentication
+ * @auth_hash_algo: the authentication hash used for this fs
+ * @log_hash: the log hash from the commit start node up to the latest reference
+ * node.
+ *
* @empty: %1 if the UBI device is empty
* @need_recovery: %1 if the file-system needs recovery
* @replaying: %1 during journal replay
@@ -1230,6 +1263,7 @@ struct ubifs_debug_info;
*/
struct ubifs_info {
struct super_block *vfs_sb;
+ struct ubifs_sb_node *sup_node;
ino_t highest_inum;
unsigned long long max_sqnum;
@@ -1270,6 +1304,7 @@ struct ubifs_info {
unsigned int default_compr:2;
unsigned int rw_incompat:1;
unsigned int assert_action:2;
+ unsigned int authenticated:1;
struct mutex tnc_mutex;
struct ubifs_zbranch zroot;
@@ -1314,6 +1349,7 @@ struct ubifs_info {
uint32_t (*key_hash)(const char *str, int len);
int key_fmt;
int key_len;
+ int hash_len;
int fanout;
int min_io_size;
@@ -1441,6 +1477,15 @@ struct ubifs_info {
kuid_t rp_uid;
kgid_t rp_gid;
+ struct crypto_shash *hash_tfm;
+ struct crypto_shash *hmac_tfm;
+ int hmac_desc_len;
+ char *auth_key_name;
+ char *auth_hash_name;
+ enum hash_algo auth_hash_algo;
+
+ struct shash_desc *log_hash;
+
/* The below fields are used only during mounting and re-mounting */
unsigned int empty:1;
unsigned int need_recovery:1;
@@ -1471,6 +1516,195 @@ extern const struct inode_operations ubifs_dir_inode_operations;
extern const struct inode_operations ubifs_symlink_inode_operations;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+/* auth.c */
+static inline int ubifs_authenticated(const struct ubifs_info *c)
+{
+ return IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION) && c->authenticated;
+}
+
+struct shash_desc *__ubifs_hash_get_desc(const struct ubifs_info *c);
+static inline struct shash_desc *ubifs_hash_get_desc(const struct ubifs_info *c)
+{
+ return ubifs_authenticated(c) ? __ubifs_hash_get_desc(c) : NULL;
+}
+
+static inline int ubifs_shash_init(const struct ubifs_info *c,
+ struct shash_desc *desc)
+{
+ if (ubifs_authenticated(c))
+ return crypto_shash_init(desc);
+ else
+ return 0;
+}
+
+static inline int ubifs_shash_update(const struct ubifs_info *c,
+ struct shash_desc *desc, const void *buf,
+ unsigned int len)
+{
+ int err = 0;
+
+ if (ubifs_authenticated(c)) {
+ err = crypto_shash_update(desc, buf, len);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static inline int ubifs_shash_final(const struct ubifs_info *c,
+ struct shash_desc *desc, u8 *out)
+{
+ return ubifs_authenticated(c) ? crypto_shash_final(desc, out) : 0;
+}
+
+int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *buf,
+ u8 *hash);
+static inline int ubifs_node_calc_hash(const struct ubifs_info *c,
+ const void *buf, u8 *hash)
+{
+ if (ubifs_authenticated(c))
+ return __ubifs_node_calc_hash(c, buf, hash);
+ else
+ return 0;
+}
+
+int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
+ struct shash_desc *inhash);
+
+/**
+ * ubifs_check_hash - compare two hashes
+ * @c: UBIFS file-system description object
+ * @expected: first hash
+ * @got: second hash
+ *
+ * Compare two hashes @expected and @got. Returns 0 when they are equal and
+ * non-zero otherwise.
+ */
+static inline int ubifs_check_hash(const struct ubifs_info *c,
+ const u8 *expected, const u8 *got)
+{
+ return crypto_memneq(expected, got, c->hash_len);
+}
+
+/**
+ * ubifs_check_hmac - compare two HMACs
+ * @c: UBIFS file-system description object
+ * @expected: first HMAC
+ * @got: second HMAC
+ *
+ * Compare two HMACs @expected and @got. Returns 0 when they are equal and
+ * non-zero otherwise.
+ */
+static inline int ubifs_check_hmac(const struct ubifs_info *c,
+ const u8 *expected, const u8 *got)
+{
+ return crypto_memneq(expected, got, c->hmac_desc_len);
+}
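Both comparison helpers deliberately use crypto_memneq() rather than memcmp():
crypto_memneq() compares in constant time, so timing does not reveal how many
leading bytes of a hash or HMAC matched. A minimal usage sketch with the
helpers defined here (illustrative only):

	u8 calc[UBIFS_MAX_HASH_LEN];

	err = ubifs_node_calc_hash(c, node, calc);
	if (!err && ubifs_check_hash(c, zbr->hash, calc))
		err = -EPERM;	/* stored and computed hashes differ */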
+
+void ubifs_bad_hash(const struct ubifs_info *c, const void *node,
+ const u8 *hash, int lnum, int offs);
+
+int __ubifs_node_check_hash(const struct ubifs_info *c, const void *buf,
+ const u8 *expected);
+static inline int ubifs_node_check_hash(const struct ubifs_info *c,
+ const void *buf, const u8 *expected)
+{
+ if (ubifs_authenticated(c))
+ return __ubifs_node_check_hash(c, buf, expected);
+ else
+ return 0;
+}
+
+int ubifs_init_authentication(struct ubifs_info *c);
+void __ubifs_exit_authentication(struct ubifs_info *c);
+static inline void ubifs_exit_authentication(struct ubifs_info *c)
+{
+ if (ubifs_authenticated(c))
+ __ubifs_exit_authentication(c);
+}
+
+/**
+ * ubifs_branch_hash - returns a pointer to the hash of a branch
+ * @c: UBIFS file-system description object
+ * @br: branch to get the hash from
+ *
+ * This returns a pointer to the hash of a branch. Since the key already is a
+ * dynamically sized object we cannot use a struct member here.
+ */
+static inline u8 *ubifs_branch_hash(struct ubifs_info *c,
+ struct ubifs_branch *br)
+{
+ return (void *)br + sizeof(*br) + c->key_len;
+}
+
+/**
+ * ubifs_copy_hash - copy a hash
+ * @c: UBIFS file-system description object
+ * @from: source hash
+ * @to: destination hash
+ *
+ * With authentication this copies a hash, otherwise does nothing.
+ */
+static inline void ubifs_copy_hash(const struct ubifs_info *c, const u8 *from,
+ u8 *to)
+{
+ if (ubifs_authenticated(c))
+ memcpy(to, from, c->hash_len);
+}
+
+int __ubifs_node_insert_hmac(const struct ubifs_info *c, void *buf,
+ int len, int ofs_hmac);
+static inline int ubifs_node_insert_hmac(const struct ubifs_info *c, void *buf,
+ int len, int ofs_hmac)
+{
+ if (ubifs_authenticated(c))
+ return __ubifs_node_insert_hmac(c, buf, len, ofs_hmac);
+ else
+ return 0;
+}
+
+int __ubifs_node_verify_hmac(const struct ubifs_info *c, const void *buf,
+ int len, int ofs_hmac);
+static inline int ubifs_node_verify_hmac(const struct ubifs_info *c,
+ const void *buf, int len, int ofs_hmac)
+{
+ if (ubifs_authenticated(c))
+ return __ubifs_node_verify_hmac(c, buf, len, ofs_hmac);
+ else
+ return 0;
+}
+
+/**
+ * ubifs_auth_node_sz - returns the size of an authentication node
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the size of an authentication node, which can
+ * be 0 for unauthenticated filesystems or the real size of an auth node
+ * when authentication is enabled.
+ */
+static inline int ubifs_auth_node_sz(const struct ubifs_info *c)
+{
+ if (ubifs_authenticated(c))
+ return sizeof(struct ubifs_auth_node) + c->hmac_desc_len;
+ else
+ return 0;
+}
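Because the helper collapses to zero on unauthenticated filesystems, callers
can reserve journal space unconditionally. A hedged sketch (the journal write
path is not part of this section; payload_len is a hypothetical node length):

	int len = ALIGN(payload_len, 8) + ubifs_auth_node_sz(c);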
+
+int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac);
+
+int __ubifs_shash_copy_state(const struct ubifs_info *c, struct shash_desc *src,
+ struct shash_desc *target);
+static inline int ubifs_shash_copy_state(const struct ubifs_info *c,
+ struct shash_desc *src,
+ struct shash_desc *target)
+{
+ if (ubifs_authenticated(c))
+ return __ubifs_shash_copy_state(c, src, target);
+ else
+ return 0;
+}
+
/* io.c */
void ubifs_ro_mode(struct ubifs_info *c, int err);
int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
@@ -1490,9 +1724,15 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int lnum, int offs);
int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
int offs);
+int ubifs_write_node_hmac(struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int hmac_offs);
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
int offs, int quiet, int must_chk_crc);
+void ubifs_init_node(struct ubifs_info *c, void *buf, int len, int pad);
+void ubifs_crc_node(struct ubifs_info *c, void *buf, int len);
void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
+int ubifs_prepare_node_hmac(struct ubifs_info *c, void *node, int len,
+ int hmac_offs, int pad);
void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
int ubifs_io_init(struct ubifs_info *c);
void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
@@ -1592,11 +1832,12 @@ int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
void *node, int *lnum, int *offs);
int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
- int offs, int len);
+ int offs, int len, const u8 *hash);
int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
int old_lnum, int old_offs, int lnum, int offs, int len);
int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
- int lnum, int offs, int len, const struct fscrypt_name *nm);
+ int lnum, int offs, int len, const u8 *hash,
+ const struct fscrypt_name *nm);
int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
const struct fscrypt_name *nm);
@@ -1659,12 +1900,12 @@ int ubifs_gc_should_commit(struct ubifs_info *c);
void ubifs_wait_for_commit(struct ubifs_info *c);
/* master.c */
+int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2);
int ubifs_read_master(struct ubifs_info *c);
int ubifs_write_master(struct ubifs_info *c);
/* sb.c */
int ubifs_read_superblock(struct ubifs_info *c);
-struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
int ubifs_fixup_free_space(struct ubifs_info *c);
int ubifs_enable_encryption(struct ubifs_info *c);
@@ -1693,7 +1934,7 @@ int ubifs_clear_orphans(struct ubifs_info *c);
/* lpt.c */
int ubifs_calc_lpt_geom(struct ubifs_info *c);
int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
- int *lpt_lebs, int *big_lpt);
+ int *lpt_lebs, int *big_lpt, u8 *hash);
int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
@@ -1712,6 +1953,7 @@ struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
struct ubifs_nnode *parent, int iip);
struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
struct ubifs_nnode *parent, int iip);
+struct ubifs_pnode *ubifs_pnode_lookup(struct ubifs_info *c, int i);
int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
@@ -1720,6 +1962,7 @@ struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
/* Needed only in debugging code in lpt_commit.c */
int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
struct ubifs_nnode *nnode);
+int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash);
/* lpt_commit.c */
int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1807,7 +2050,7 @@ int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf);
int ubifs_rcvry_gc_commit(struct ubifs_info *c);
int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
int deletion, loff_t new_size);
-int ubifs_recover_size(struct ubifs_info *c);
+int ubifs_recover_size(struct ubifs_info *c, bool in_place);
void ubifs_destroy_size_tree(struct ubifs_info *c);
/* ioctl.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index fcda0fc97b90..ec85aeaed54a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -175,8 +175,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
{
struct udf_sb_info *sbi = UDF_SB(sb);
int alloc_count = 0;
- int bit, block, block_group, group_start;
- int nr_groups, bitmap_nr;
+ int bit, block, block_group;
+ int bitmap_nr;
struct buffer_head *bh;
__u32 part_len;
@@ -189,10 +189,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
block_count = part_len - first_block;
do {
- nr_groups = udf_compute_nr_groups(sb, partition);
block = first_block + (sizeof(struct spaceBitmapDesc) << 3);
block_group = block >> (sb->s_blocksize_bits + 3);
- group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
if (bitmap_nr < 0)
@@ -652,12 +650,6 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
udf_table_free_blocks(sb, map->s_uspace.s_table,
bloc, offset, count);
- } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
- udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
- bloc, offset, count);
- } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
- udf_table_free_blocks(sb, map->s_fspace.s_table,
- bloc, offset, count);
}
if (inode) {
@@ -684,16 +676,6 @@ inline int udf_prealloc_blocks(struct super_block *sb,
map->s_uspace.s_table,
partition, first_block,
block_count);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- allocated = udf_bitmap_prealloc_blocks(sb,
- map->s_fspace.s_bitmap,
- partition, first_block,
- block_count);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- allocated = udf_table_prealloc_blocks(sb,
- map->s_fspace.s_table,
- partition, first_block,
- block_count);
else
return 0;
@@ -717,14 +699,6 @@ inline udf_pblk_t udf_new_block(struct super_block *sb,
block = udf_table_new_block(sb,
map->s_uspace.s_table,
partition, goal, err);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- block = udf_bitmap_new_block(sb,
- map->s_fspace.s_bitmap,
- partition, goal, err);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- block = udf_table_new_block(sb,
- map->s_fspace.s_table,
- partition, goal, err);
else {
*err = -EIO;
return 0;
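With the freed-space branches gone, every allocator entry point in balloc.c reduces to the same two-way dispatch on the unallocated-space descriptors; a sketch of the shared shape (udf_free_blocks, udf_prealloc_blocks and udf_new_block differ only in the operation performed):

	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
		/* operate on map->s_uspace.s_bitmap */;
	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
		/* operate on map->s_uspace.s_table */;
	else
		/* no supported allocation info: return 0 or -EIO */;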
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5df554a9f9c9..ae796e10f68b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1357,6 +1357,12 @@ reread:
iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) &
ICBTAG_FLAG_AD_MASK;
+ if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT &&
+ iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG &&
+ iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ ret = -EIO;
+ goto out;
+ }
iinfo->i_unique = 0;
iinfo->i_lenEAttr = 0;
iinfo->i_lenExtents = 0;
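The new check rejects any allocation strategy the driver cannot parse before the inode is ever used. For reference, the possible values of icbTag.flags & ICBTAG_FLAG_AD_MASK per ECMA-167 4/14.6.8, with the constants as defined in the UDF headers:

	#define ICBTAG_FLAG_AD_SHORT	0x0000	/* short_ad extent descriptors */
	#define ICBTAG_FLAG_AD_LONG	0x0001	/* long_ad extent descriptors */
	#define ICBTAG_FLAG_AD_EXTENDED	0x0002	/* ext_ad - unsupported, now -EIO */
	#define ICBTAG_FLAG_AD_IN_ICB	0x0003	/* data embedded in the ICB */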
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6f515651a2c2..ffd8038ff728 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -290,12 +290,8 @@ static void udf_free_partition(struct udf_part_map *map)
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
iput(map->s_uspace.s_table);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- iput(map->s_fspace.s_table);
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
udf_sb_free_bitmap(map->s_uspace.s_bitmap);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- udf_sb_free_bitmap(map->s_fspace.s_bitmap);
if (map->s_partition_type == UDF_SPARABLE_MAP15)
for (i = 0; i < 4; i++)
brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
@@ -613,14 +609,11 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
struct udf_options uopt;
struct udf_sb_info *sbi = UDF_SB(sb);
int error = 0;
- struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
+
+ if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
+ return -EACCES;
sync_filesystem(sb);
- if (lvidiu) {
- int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
- if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY))
- return -EACCES;
- }
uopt.flags = sbi->s_flags;
uopt.uid = sbi->s_uid;
@@ -834,16 +827,20 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32);
- if (ret < 0)
- goto out_bh;
-
- strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
+ if (ret < 0) {
+ strcpy(UDF_SB(sb)->s_volume_ident, "InvalidName");
+ pr_warn("incorrect volume identification, setting to "
+ "'InvalidName'\n");
+ } else {
+ strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
+ }
udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128);
- if (ret < 0)
+ if (ret < 0) {
+ ret = 0;
goto out_bh;
-
+ }
outstr[ret] = 0;
udf_debug("volSetIdent[] = '%s'\n", outstr);
@@ -988,12 +985,62 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
return bitmap;
}
+static int check_partition_desc(struct super_block *sb,
+ struct partitionDesc *p,
+ struct udf_part_map *map)
+{
+ bool umap, utable, fmap, ftable;
+ struct partitionHeaderDesc *phd;
+
+ switch (le32_to_cpu(p->accessType)) {
+ case PD_ACCESS_TYPE_READ_ONLY:
+ case PD_ACCESS_TYPE_WRITE_ONCE:
+ case PD_ACCESS_TYPE_REWRITABLE:
+ case PD_ACCESS_TYPE_NONE:
+ goto force_ro;
+ }
+
+ /* No Partition Header Descriptor? */
+ if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+ strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+ goto force_ro;
+
+ phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+ utable = phd->unallocSpaceTable.extLength;
+ umap = phd->unallocSpaceBitmap.extLength;
+ ftable = phd->freedSpaceTable.extLength;
+ fmap = phd->freedSpaceBitmap.extLength;
+
+ /* No allocation info? */
+ if (!utable && !umap && !ftable && !fmap)
+ goto force_ro;
+
+ /* We don't support blocks that require erasing before overwrite */
+ if (ftable || fmap)
+ goto force_ro;
+ /* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */
+ if (utable && umap)
+ goto force_ro;
+
+ if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+ map->s_partition_type == UDF_VIRTUAL_MAP20)
+ goto force_ro;
+
+ return 0;
+force_ro:
+ if (!sb_rdonly(sb))
+ return -EACCES;
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+ return 0;
+}
+
static int udf_fill_partdesc_info(struct super_block *sb,
struct partitionDesc *p, int p_index)
{
struct udf_part_map *map;
struct udf_sb_info *sbi = UDF_SB(sb);
struct partitionHeaderDesc *phd;
+ int err;
map = &sbi->s_partmaps[p_index];
@@ -1013,8 +1060,16 @@ static int udf_fill_partdesc_info(struct super_block *sb,
p_index, map->s_partition_type,
map->s_partition_root, map->s_partition_len);
- if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
- strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+ err = check_partition_desc(sb, p, map);
+ if (err)
+ return err;
+
+ /*
+	 * Skip loading allocation info if we cannot ever write to the fs.
+	 * This is needed for correctness, as we may have decided to force a
+	 * read-only mount to avoid allocation info we don't support.
+ */
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
return 0;
phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
@@ -1050,40 +1105,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
p_index, bitmap->s_extPosition);
}
- if (phd->partitionIntegrityTable.extLength)
- udf_debug("partitionIntegrityTable (part %d)\n", p_index);
-
- if (phd->freedSpaceTable.extLength) {
- struct kernel_lb_addr loc = {
- .logicalBlockNum = le32_to_cpu(
- phd->freedSpaceTable.extPosition),
- .partitionReferenceNum = p_index,
- };
- struct inode *inode;
-
- inode = udf_iget_special(sb, &loc);
- if (IS_ERR(inode)) {
- udf_debug("cannot load freedSpaceTable (part %d)\n",
- p_index);
- return PTR_ERR(inode);
- }
- map->s_fspace.s_table = inode;
- map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
- udf_debug("freedSpaceTable (part %d) @ %lu\n",
- p_index, map->s_fspace.s_table->i_ino);
- }
-
- if (phd->freedSpaceBitmap.extLength) {
- struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
- if (!bitmap)
- return -ENOMEM;
- map->s_fspace.s_bitmap = bitmap;
- bitmap->s_extPosition = le32_to_cpu(
- phd->freedSpaceBitmap.extPosition);
- map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
- udf_debug("freedSpaceBitmap (part %d) @ %u\n",
- p_index, bitmap->s_extPosition);
- }
return 0;
}
@@ -1257,6 +1278,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
ret = -EACCES;
goto out_bh;
}
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
ret = udf_load_vat(sb, i, type1_idx);
if (ret < 0)
goto out_bh;
@@ -1452,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
ret = 0;
+
+ if (!sbi->s_lvid_bh) {
+ /* We can't generate unique IDs without a valid LVID */
+ if (sb_rdonly(sb)) {
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+ } else {
+ udf_warn(sb, "Damaged or missing LVID, forcing "
+ "readonly mount\n");
+ ret = -EACCES;
+ }
+ }
out_bh:
brelse(bh);
return ret;
@@ -1921,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return 0;
}
+static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
+{
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+ udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
+ lvid->descTag.descCRC = cpu_to_le16(
+ crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+ le16_to_cpu(lvid->descTag.descCRCLength)));
+ lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+}
+
static void udf_open_lvid(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1939,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
else
UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -1964,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb)
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1976,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -1987,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb)
if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
/*
* We set buffer uptodate unconditionally here to avoid spurious
* warnings from mark_buffer_dirty() when previous EIO has marked
* the buffer as !uptodate
*/
set_buffer_uptodate(bh);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -2026,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb)
if (!(++uniqueID & 0xFFFFFFFF))
uniqueID += 16;
lvhd->uniqueID = cpu_to_le64(uniqueID);
+ udf_updated_lvid(sb);
mutex_unlock(&sbi->s_alloc_mutex);
- mark_buffer_dirty(bh);
return ret;
}
@@ -2155,10 +2186,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
UDF_MAX_READ_VERSION);
ret = -EINVAL;
goto error_out;
- } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
- !sb_rdonly(sb)) {
- ret = -EACCES;
- goto error_out;
+ } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) {
+ if (!sb_rdonly(sb)) {
+ ret = -EACCES;
+ goto error_out;
+ }
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
}
sbi->s_udfrev = minUDFWriteRev;
@@ -2176,10 +2209,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
- UDF_PART_FLAG_READ_ONLY &&
- !sb_rdonly(sb)) {
- ret = -EACCES;
- goto error_out;
+ UDF_PART_FLAG_READ_ONLY) {
+ if (!sb_rdonly(sb)) {
+ ret = -EACCES;
+ goto error_out;
+ }
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
}
if (udf_find_fileset(sb, &fileset, &rootdir)) {
@@ -2294,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait)
mutex_lock(&sbi->s_alloc_mutex);
if (sbi->s_lvid_dirty) {
+ struct buffer_head *bh = sbi->s_lvid_bh;
+ struct logicalVolIntegrityDesc *lvid;
+
+ lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+ udf_finalize_lvid(lvid);
+
/*
* Blockdevice will be synced later so we don't have to submit
* the buffer for IO
*/
- mark_buffer_dirty(sbi->s_lvid_bh);
+ mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
}
mutex_unlock(&sbi->s_alloc_mutex);
@@ -2433,10 +2474,6 @@ static unsigned int udf_count_free(struct super_block *sb)
accum += udf_count_free_bitmap(sb,
map->s_uspace.s_bitmap);
}
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
- accum += udf_count_free_bitmap(sb,
- map->s_fspace.s_bitmap);
- }
if (accum)
return accum;
@@ -2444,11 +2481,6 @@ static unsigned int udf_count_free(struct super_block *sb)
accum += udf_count_free_table(sb,
map->s_uspace.s_table);
}
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
- accum += udf_count_free_table(sb,
- map->s_fspace.s_table);
- }
-
return accum;
}
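The recurring pattern throughout this super.c patch: features the driver cannot write back no longer refuse the mount outright. A read-write mount still fails with -EACCES, but a read-only mount proceeds with UDF_FLAG_RW_INCOMPAT recorded so the udf_remount_fs() hunk above can refuse a later remount to read-write. Schematically:

	if (<feature we cannot write>) {
		if (!sb_rdonly(sb))
			return -EACCES;	/* explicit rw mount: refuse */
		UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);	/* ro: remember */
	}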
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 9424d7cab790..3d83be54c474 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,11 +30,11 @@
#define UDF_FLAG_LASTBLOCK_SET 16
#define UDF_FLAG_BLOCKSIZE_SET 17
#define UDF_FLAG_INCONSISTENT 18
+#define UDF_FLAG_RW_INCOMPAT 19 /* Set when we find RW incompatible
+ * feature */
#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
-#define UDF_PART_FLAG_FREED_BITMAP 0x0004
-#define UDF_PART_FLAG_FREED_TABLE 0x0008
#define UDF_PART_FLAG_READ_ONLY 0x0010
#define UDF_PART_FLAG_WRITE_ONCE 0x0020
#define UDF_PART_FLAG_REWRITABLE 0x0040
@@ -50,8 +50,6 @@
#define UDF_INVALID_MODE ((umode_t)-1)
-#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
-
#define MF_DUPLICATE_MD 0x01
#define MF_MIRROR_FE_LOADED 0x02
@@ -93,10 +91,6 @@ struct udf_part_map {
struct udf_bitmap *s_bitmap;
struct inode *s_table;
} s_uspace;
- union {
- struct udf_bitmap *s_bitmap;
- struct inode *s_table;
- } s_fspace;
__u32 s_partition_root;
__u32 s_partition_len;
__u16 s_partition_type;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 45234791fec2..5fcfa96463eb 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -351,6 +351,11 @@ try_again:
return u_len;
}
+/*
+ * Convert CS0 dstring to output charset. Warning: This function may truncate
+ * the input string if it is too long, as it is used for informational strings
+ * only and it is better to truncate the string than to refuse mounting the
+ * media.
+ */
int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
const uint8_t *ocu_i, int i_len)
{
@@ -359,9 +364,12 @@ int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
if (i_len > 0) {
s_len = ocu_i[i_len - 1];
if (s_len >= i_len) {
- pr_err("incorrect dstring lengths (%d/%d)\n",
- s_len, i_len);
- return -EINVAL;
+ pr_warn("incorrect dstring lengths (%d/%d),"
+ " truncating\n", s_len, i_len);
+ s_len = i_len - 1;
+			/* 2-byte encoding? Round down so s_len stays odd (ID byte + 2-byte chars) */
+			if (ocu_i[0] == 16)
+				s_len -= (s_len - 1) & 1;
}
}
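To see why the rounding is needed: a CS0 dstring stores its effective length in the final byte, and s_len counts the compression-ID byte plus the character bytes, so with the 2-byte encoding (compression ID 16) a valid s_len is always odd. A worked example of the truncation path:

	/* i_len = 31, corrupt trailing length byte claims s_len = 40 */
	s_len = i_len - 1;	/* 30: the largest length that fits */
	/* ID is 16: (30 - 1) & 1 == 1, so s_len becomes 29 - the ID  */
	/* byte plus fourteen whole 2-byte characters, none split.    */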
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bfa0ec69f924..89800fc7dc9d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -53,7 +53,7 @@ struct userfaultfd_ctx {
/* a refile sequence protected by fault_pending_wqh lock */
struct seqcount refile_seq;
/* pseudo fd refcounting */
- atomic_t refcount;
+ refcount_t refcount;
/* userfaultfd syscall flags */
unsigned int flags;
/* features requested from the userspace */
@@ -140,8 +140,7 @@ out:
*/
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
- if (!atomic_inc_not_zero(&ctx->refcount))
- BUG();
+ refcount_inc(&ctx->refcount);
}
/**
@@ -154,7 +153,7 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
*/
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
@@ -686,7 +685,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
return -ENOMEM;
}
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
ctx->flags = octx->flags;
ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
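refcount_t is a saturating counter: refcount_inc() on a zero count WARNs and saturates rather than resurrecting a freed object, which is exactly what the removed atomic_inc_not_zero()/BUG() pairing policed by hand. The lifecycle as used by this file, in brief:

	refcount_set(&ctx->refcount, 1);	/* creation: one reference */
	refcount_inc(&ctx->refcount);		/* WARNs if count was zero */
	if (refcount_dec_and_test(&ctx->refcount))
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);	/* last ref */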
@@ -736,10 +735,18 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct userfaultfd_ctx *ctx;
ctx = vma->vm_userfaultfd_ctx.ctx;
- if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
+
+ if (!ctx)
+ return;
+
+ if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
WRITE_ONCE(ctx->mmap_changing, true);
+ } else {
+ /* Drop uffd context if remap feature not enabled */
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
}
}
@@ -926,7 +933,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in(
wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
- VM_BUG_ON(!spin_is_locked(&wqh->lock));
+ lockdep_assert_held(&wqh->lock);
uwq = NULL;
if (!waitqueue_active(wqh))
@@ -1026,7 +1033,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
struct userfaultfd_ctx *fork_nctx = NULL;
/* always take the fd_wqh lock before the fault_pending_wqh lock */
- spin_lock(&ctx->fd_wqh.lock);
+ spin_lock_irq(&ctx->fd_wqh.lock);
__add_wait_queue(&ctx->fd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -1112,13 +1119,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
ret = -EAGAIN;
break;
}
- spin_unlock(&ctx->fd_wqh.lock);
+ spin_unlock_irq(&ctx->fd_wqh.lock);
schedule();
- spin_lock(&ctx->fd_wqh.lock);
+ spin_lock_irq(&ctx->fd_wqh.lock);
}
__remove_wait_queue(&ctx->fd_wqh, &wait);
__set_current_state(TASK_RUNNING);
- spin_unlock(&ctx->fd_wqh.lock);
+ spin_unlock_irq(&ctx->fd_wqh.lock);
if (!ret && msg->event == UFFD_EVENT_FORK) {
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
@@ -1361,6 +1368,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
ret = -EINVAL;
if (!vma_can_userfault(cur))
goto out_unlock;
+
+ /*
+ * UFFDIO_COPY will fill file holes even without
+ * PROT_WRITE. This check enforces that if this is a
+			 * MAP_SHARED vma, the process has write permission to the
+			 * backing file. On a MAP_SHARED vma, VM_MAYWRITE also
+			 * guarantees that no F_SEAL_WRITE seal is in place and
+			 * that none can be taken until the vma is destroyed.
+ */
+ ret = -EPERM;
+ if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
+ goto out_unlock;
+
/*
* If this vma contains ending address, and huge pages
* check alignment.
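From userspace the new check surfaces as EPERM at registration time. A hypothetical snippet (the mapping setup is an assumption; the uffd ioctl names are the standard API):

	/* fd opened O_RDONLY and mapped MAP_SHARED: VM_MAYWRITE is clear */
	void *addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1 && errno == EPERM)
		; /* refused: UFFDIO_COPY could write where we cannot */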
@@ -1406,6 +1426,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
BUG_ON(!vma_can_userfault(vma));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
+ WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
@@ -1560,6 +1581,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (!vma->vm_userfaultfd_ctx.ctx)
goto skip;
+ WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+
if (vma->vm_start > start)
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
@@ -1911,7 +1934,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
if (!ctx)
return -ENOMEM;
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
ctx->state = UFFD_STATE_WAIT_API;
diff --git a/fs/utimes.c b/fs/utimes.c
index bdcf2daf39c1..350c9c16ace1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
* of sys_utimes.
*/
#ifdef __ARCH_WANT_SYS_UTIME32
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
- struct old_utimbuf32 __user *, t)
+SYSCALL_DEFINE2(utime32, const char __user *, filename,
+ struct old_utimbuf32 __user *, t)
{
struct timespec64 tv[2];
@@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
}
#endif
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
+SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
{
struct timespec64 tv[2];
@@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
const char __user *, filename,
struct old_timeval32 __user *, t)
{
return do_compat_futimesat(dfd, filename, t);
}
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t)
+SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
{
return do_compat_futimesat(AT_FDCWD, filename, t);
}
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 9345802c99f7..1ef8acf35e7d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
{ /* BNO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_cntbt_buf_ops,
.work = &xfs_cntroot_init,
.need_init = true
},
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
{ /* FINO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_inobt_buf_ops,
+ .ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
.need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
@@ -414,7 +414,6 @@ xfs_ag_extend_space(
struct aghdr_init_data *id,
xfs_extlen_t len)
{
- struct xfs_owner_info oinfo;
struct xfs_buf *bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
@@ -448,17 +447,17 @@ xfs_ag_extend_space(
/*
* Free the new space.
*
- * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+ * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
* this doesn't actually exist in the rmap btree.
*/
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
error = xfs_rmap_free(tp, bp, id->agno,
be32_to_cpu(agf->agf_length) - len,
- len, &oinfo);
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE);
if (error)
return error;
return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
be32_to_cpu(agf->agf_length) - len),
- len, &oinfo, XFS_AG_RESV_NONE);
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ XFS_AG_RESV_NONE);
}
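XFS_RMAP_OINFO_SKIP_UPDATE (and XFS_RMAP_OINFO_AG used in the alloc.c hunks below) are const singletons replacing the old xfs_rmap_skip_owner_update()/xfs_rmap_ag_owner() initializer helpers; presumably defined in the rmap code roughly as:

	const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
		.oi_owner = XFS_RMAP_OWN_NULL,
	};
	const struct xfs_owner_info XFS_RMAP_OINFO_AG = {
		.oi_owner = XFS_RMAP_OWN_AG,
	};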
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc36c06..e2ba2a3b63b2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
*/
ask = used = 0;
- mp->m_inotbt_nores = true;
+ mp->m_finobt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index e1c0c0d2f1b0..bc3367b8b7bb 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -568,9 +568,9 @@ xfs_agfl_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return NULL;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
return __this_address;
- if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
.verify_struct = xfs_agfl_verify,
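The new .magic array pairs the pre-CRC (v4) and CRC-enabled (v5) on-disk magic values, and xfs_verify_magic() selects one by whether the superblock has CRCs; a magic16 variant covers the be16 magics used by the da node and attr leaf ops further down. The helper lives outside this hunk, so treat this as a sketch of its presumable shape:

	bool
	xfs_verify_magic(
		struct xfs_buf		*bp,
		__be32			dmagic)
	{
		struct xfs_mount	*mp = bp->b_target->bt_mount;
		int			idx;

		idx = xfs_sb_version_hascrc(&mp->m_sb);	/* 0 = v4, 1 = v5 */
		if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
			return false;
		return dmagic == bp->b_ops->magic[idx];
	}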
@@ -1594,7 +1595,6 @@ xfs_alloc_ag_vextent_small(
xfs_extlen_t *flenp, /* result length */
int *stat) /* status: 0-freelist, 1-normal/none */
{
- struct xfs_owner_info oinfo;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1648,9 +1648,8 @@ xfs_alloc_ag_vextent_small(
* doesn't live in the free space, we need to clear
* out the OWN_AG rmap.
*/
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_free(args->tp, args->agbp, args->agno,
- fbno, 1, &oinfo);
+ fbno, 1, &XFS_RMAP_OINFO_AG);
if (error)
goto error0;
@@ -1694,28 +1693,28 @@ error0:
*/
STATIC int
xfs_free_ag_extent(
- xfs_trans_t *tp,
- xfs_buf_t *agbp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
{
- xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
- int error; /* error return value */
- xfs_agblock_t gtbno; /* start of right neighbor block */
- xfs_extlen_t gtlen; /* length of right neighbor block */
- int haveleft; /* have a left neighbor block */
- int haveright; /* have a right neighbor block */
- int i; /* temp, result code */
- xfs_agblock_t ltbno; /* start of left neighbor block */
- xfs_extlen_t ltlen; /* length of left neighbor block */
- xfs_mount_t *mp; /* mount point struct for filesystem */
- xfs_agblock_t nbno; /* new starting block of freespace */
- xfs_extlen_t nlen; /* new length of freespace */
- xfs_perag_t *pag; /* per allocation group data */
+ struct xfs_mount *mp;
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t gtbno; /* start of right neighbor */
+ xfs_extlen_t gtlen; /* length of right neighbor */
+ xfs_agblock_t ltbno; /* start of left neighbor */
+ xfs_extlen_t ltlen; /* length of left neighbor */
+ xfs_agblock_t nbno; /* new starting block of freesp */
+ xfs_extlen_t nlen; /* new length of freespace */
+ int haveleft; /* have a left neighbor */
+ int haveright; /* have a right neighbor */
+ int i;
+ int error;
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
@@ -2314,10 +2313,11 @@ xfs_alloc_fix_freelist(
* repair/rmap.c in xfsprogs for details.
*/
memset(&targs, 0, sizeof(targs));
+ /* struct copy below */
if (flags & XFS_ALLOC_FLAG_NORMAP)
- xfs_rmap_skip_owner_update(&targs.oinfo);
+ targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
else
- xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
+ targs.oinfo = XFS_RMAP_OINFO_AG;
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
@@ -2435,7 +2435,6 @@ xfs_alloc_get_freelist(
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
- xfs_perag_put(pag);
logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
if (btreeblk) {
@@ -2443,6 +2442,7 @@ xfs_alloc_get_freelist(
pag->pagf_btreeblks++;
logflags |= XFS_AGF_BTREEBLKS;
}
+ xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
@@ -2588,8 +2588,10 @@ xfs_agf_verify(
return __this_address;
}
- if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2671,6 +2673,7 @@ xfs_agf_write_verify(
const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
.verify_struct = xfs_agf_verify,
@@ -3008,21 +3011,21 @@ out:
* Just break up the extent address and hand off to xfs_free_ag_extent
* after fixing up the freelist.
*/
-int /* error */
+int
__xfs_free_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo, /* extent owner */
- enum xfs_ag_resv_type type, /* block reservation type */
- bool skip_discard)
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *agbp;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
- int error;
- unsigned int busy_flags = 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ int error;
+ unsigned int busy_flags = 0;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 00cd5ec4cb6b..d6ed5d2c07c2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -182,7 +182,7 @@ __xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo, /* extent owner */
+ const struct xfs_owner_info *oinfo, /* extent owner */
enum xfs_ag_resv_type type, /* block reservation type */
bool skip_discard);
@@ -191,7 +191,7 @@ xfs_free_extent(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
return __xfs_free_extent(tp, bno, len, oinfo, type, false);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8a2802..9fe949f6055e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
/*
- * magic number and level verification
- *
- * During growfs operations, we can't verify the exact level or owner as
- * the perag is not fully initialised and hence not attached to the
- * buffer. In this case, check against the maximum tree depth.
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
*
- * Similarly, during log recovery we will have a perag structure
- * attached, but the agf information will not yet have been initialised
- * from the on disk AGF. Again, we can only check against maximum limits
- * in this case.
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
*/
level = be16_to_cpu(block->bb_level);
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
return __this_address;
- break;
- case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
- return __this_address;
- break;
- default:
+ } else if (level >= mp->m_ag_maxlevels)
return __this_address;
- }
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
}
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
- .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
.verify_struct = xfs_allocbt_verify,
};
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
STATIC int
xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_bnobt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_bnobt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_cntbt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_cntbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 844ed87b1900..2dd9ee2a2e08 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
xfs_da_state_free(state);
return retval;
}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
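A hedged example of the intended call pattern — the -EFSCORRUPTED mapping is an assumption based on how XFS callers typically report bad metadata:

	if (!xfs_attr_namecheck(name, namelen))
		return -EFSCORRUPTED;	/* embedded NUL or overlong name */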
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index bdf52a333f3f..2297d8467666 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6fc5425b1474..1f6e3965ff74 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -243,27 +243,16 @@ xfs_attr3_leaf_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *entries;
- uint16_t end;
+ uint32_t end; /* must be 32bit - see below */
int i;
+ xfs_failaddr_t fa;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
- return __this_address;
- }
/*
* In recovery there is a transient state where count == 0 is valid
* because we may have transitioned an empty shortform attr to a leaf
@@ -293,6 +282,11 @@ xfs_attr3_leaf_verify(
/*
* Quickly check the freemap information. Attribute data has to be
* aligned to 4-byte boundaries, and likewise for the free space.
+ *
+	 * Note that the freemap base and size are stored as be16 fields, so
+	 * they cannot themselves exceed 16 bits. However, their sum - the end
+	 * pointer - can overflow 16 bits, so we have to be careful to detect
+	 * that and use uint32_t for the end-pointer checks.
*/
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
@@ -303,7 +297,9 @@ xfs_attr3_leaf_verify(
return __this_address;
if (ichdr.freemap[i].size & 0x3)
return __this_address;
- end = ichdr.freemap[i].base + ichdr.freemap[i].size;
+
+		/* beware of 16-bit overflows here */
+ end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
if (end < ichdr.freemap[i].base)
return __this_address;
if (end > mp->m_attr_geo->blksize)
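Concretely, on a 64k-block filesystem a perfectly valid freemap entry can end exactly at the block boundary: base = 0xff00, size = 0x0100 sums to 0x10000, which wraps to 0 in uint16_t arithmetic and trips the end < base test as a false corruption report. The uint32_t widening keeps the true value:

	end16 = 0xff00 + 0x0100;		/* wraps to 0: looks corrupt */
	end32 = (uint32_t)0xff00 + 0x0100;	/* 0x10000 == blksize: valid */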
@@ -362,6 +358,8 @@ xfs_attr3_leaf_read_verify(
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
.name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
.verify_struct = xfs_attr3_leaf_verify,
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c6b523..65ff600a8067 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
static xfs_failaddr_t
xfs_attr3_rmt_verify(
struct xfs_mount *mp,
+ struct xfs_buf *bp,
void *ptr,
int fsbsize,
xfs_daddr_t bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
.verify_struct = xfs_attr3_rmt_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74d7228e755b..48502cb9990f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -536,7 +536,7 @@ __xfs_bmap_add_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
bool skip_discard)
{
struct xfs_extent_free_item *new; /* new element */
@@ -564,7 +564,7 @@ __xfs_bmap_add_free(
if (oinfo)
new->xefi_oinfo = *oinfo;
else
- xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+ new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
new->xefi_skip_discard = skip_discard;
trace_xfs_bmap_free_defer(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
*/
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
*/
STATIC int /* error */
xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- /* REFERENCED */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
- struct xfs_ifork *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
struct xfs_owner_info oinfo;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+	/* Check if we actually need the extent format first. */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
#ifdef DEBUG
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -1694,10 +1696,13 @@ xfs_bmap_add_extent_delay_real(
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Filling in all of a previously delayed allocation extent.
- * The right neighbor is contiguous, the left is not.
+ * The right neighbor is contiguous, the left is not. Take care
+ * with delay -> unwritten extent allocation here because the
+ * delalloc record we are overwriting is always written.
*/
PREV.br_startblock = new->br_startblock;
PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
xfs_iext_next(ifp, &bma->icur);
xfs_iext_remove(bma->ip, &bma->icur, state);
@@ -2026,7 +2031,7 @@ done:
/*
* Convert an unwritten allocation to a real allocation or vice versa.
*/
-STATIC int /* error */
+int /* error */
xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
@@ -3450,7 +3455,7 @@ xfs_bmap_btalloc(
args.tp = ap->tp;
args.mp = mp;
args.fsbno = ap->blkno;
- xfs_rmap_skip_owner_update(&args.oinfo);
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
/* Trim the allocation back to the maximum an AG can fit. */
args.maxlen = min(ap->length, mp->m_ag_max_usable);
@@ -3682,17 +3687,6 @@ xfs_trim_extent(
}
}
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
- struct xfs_bmbt_irec *irec,
- struct xfs_inode *ip)
-
-{
- xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
- i_size_read(VFS_I(ip))));
-}
-
/*
* Trim the returned map to the required bounds
*/
@@ -4200,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
return 0;
}
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
/*
* Map file blocks to filesystem blocks, and allocate blocks or convert the
* extent state if necessary. Details behaviour is controlled by the flags
@@ -4244,9 +4276,7 @@ xfs_bmapi_write(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(tp != NULL ||
- (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
- (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4279,25 +4309,12 @@ xfs_bmapi_write(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!tp || tp->t_firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- bma.minleft = 1;
- } else {
- bma.minleft = 0;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
goto error0;
}
- n = 0;
- end = bno + len;
- obno = bno;
-
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4306,7 +4323,11 @@ xfs_bmapi_write(
bma.ip = ip;
bma.total = total;
bma.datatype = 0;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ n = 0;
+ end = bno + len;
+ obno = bno;
while (bno < end && n < *nmap) {
bool need_alloc = false, wasdelay = false;
@@ -4320,26 +4341,7 @@ xfs_bmapi_write(
ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
(flags & XFS_BMAPI_COWFORK)));
- if (flags & XFS_BMAPI_DELALLOC) {
- /*
- * For the COW fork we can reasonably get a
- * request for converting an extent that races
- * with other threads already having converted
- * part of it, as there converting COW to
- * regular blocks is not protected using the
- * IOLOCK.
- */
- ASSERT(flags & XFS_BMAPI_COWFORK);
- if (!(flags & XFS_BMAPI_COWFORK)) {
- error = -EIO;
- goto error0;
- }
-
- if (eof || bno >= end)
- break;
- } else {
- need_alloc = true;
- }
+ need_alloc = true;
} else if (isnullstartblock(bma.got.br_startblock)) {
wasdelay = true;
}
@@ -4348,8 +4350,7 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if ((need_alloc || wasdelay) &&
- !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4417,49 +4418,130 @@ xfs_bmapi_write(
}
*nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- ASSERT(bma.cur);
- error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
- &tmp_logflags, whichfork);
- bma.logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork));
- error = 0;
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmalloca bma = { NULL };
+ struct xfs_trans *tp;
+ int error;
+
/*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent records if we've converted to btree format.
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
*/
- if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- bma.logflags &= ~xfs_ilog_fext(whichfork);
- else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
/*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
*/
- if (bma.logflags)
- xfs_trans_log_inode(tp, ip, bma.logflags);
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ if (whichfork == XFS_COW_FORK)
+ bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
- if (bma.cur) {
- xfs_btree_del_cursor(bma.cur, error);
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ if (error)
+ goto out_finish;
}
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
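The -EAGAIN return and the *seq cookie hint at the intended caller pattern in writeback: redo the extent lookup when the COW-fork race is detected, and revalidate any cached mapping against if_seq before trusting it. A sketch under those assumptions:

	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset_fsb,
					   &imap, &seq);
	if (error == -EAGAIN)
		; /* extent moved to the data fork: repeat the lookup */
	/* 'seq' snapshots ifp->if_seq for later revalidation of imap */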
@@ -4533,13 +4615,7 @@ xfs_bmapi_remap(
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5403,24 +5479,11 @@ nodelete:
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- /*
- * transform from btree to extents, give it cur
- */
- else if (xfs_bmap_wants_extents(ip, whichfork)) {
- ASSERT(cur != NULL);
- error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
}
- /*
- * transform from extents to local?
- */
- error = 0;
+
error0:
/*
* Log everything. Do this after conversion, there's no point in
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 488dc8860fd7..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC 0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
/* Skip online discard of freed extents */
#define XFS_BMAPI_NODISCARD 0x1000
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
@@ -181,12 +173,11 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
-void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
- xfs_filblks_t len, struct xfs_owner_info *oinfo,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
@@ -228,13 +219,20 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+ unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
static inline void
xfs_bmap_add_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
- struct xfs_owner_info *oinfo)
+ const struct xfs_owner_info *oinfo)
{
__xfs_bmap_add_free(tp, bno, len, oinfo, false);
}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2e2a43..aff82ed112c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_BMAP_MAGIC):
- break;
- default:
- return __this_address;
}
/*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
.name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
.verify_struct = xfs_bmbt_verify,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 34c6d7bd4d18..bbdae2b4559f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -330,7 +330,7 @@ xfs_btree_sblock_verify_crc(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
- return __this_address;
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee94b5dd..e2737e2ac2ae 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_zone_free(xfs_da_state_zone, state);
}
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static xfs_failaddr_t
xfs_da3_node_verify(
struct xfs_buf *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
const struct xfs_dir_ops *ops;
+ xfs_failaddr_t fa;
ops = xfs_dir_get_ops(mp, NULL);
ops->node_hdr_from_disk(&ichdr, hdr);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_DA3_NODE_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_DA_NODE_MAGIC)
- return __this_address;
- }
if (ichdr.level == 0)
return __this_address;
if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
.verify_struct = xfs_da3_node_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3bffc78..ae654e06b2fb 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
}
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index e792b167150a..94f00427de98 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -172,7 +172,13 @@
* reoccur.
*/
-static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
+static const struct xfs_defer_op_type *defer_op_types[] = {
+ [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+};
/*
* For each pending item in the intake list, log its intent item and the
@@ -185,15 +191,15 @@ xfs_defer_create_intents(
{
struct list_head *li;
struct xfs_defer_pending *dfp;
+ const struct xfs_defer_op_type *ops;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
- dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
- dfp->dfp_count);
+ ops = defer_op_types[dfp->dfp_type];
+ dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- list_sort(tp->t_mountp, &dfp->dfp_work,
- dfp->dfp_type->diff_items);
+ list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
list_for_each(li, &dfp->dfp_work)
- dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
+ ops->log_item(tp, dfp->dfp_intent, li);
}
}
@@ -204,14 +210,16 @@ xfs_defer_trans_abort(
struct list_head *dop_pending)
{
struct xfs_defer_pending *dfp;
+ const struct xfs_defer_op_type *ops;
trace_xfs_defer_trans_abort(tp, _RET_IP_);
/* Abort intent items that don't have a done item. */
list_for_each_entry(dfp, dop_pending, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
- dfp->dfp_type->abort_intent(dfp->dfp_intent);
+ ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
}
}
@@ -315,18 +323,20 @@ xfs_defer_cancel_list(
struct xfs_defer_pending *pli;
struct list_head *pwi;
struct list_head *n;
+ const struct xfs_defer_op_type *ops;
/*
* Free the pending items. Caller should already have arranged
* for the intent items to be released.
*/
list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_cancel_list(mp, dfp);
list_del(&dfp->dfp_list);
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
- dfp->dfp_type->cancel_item(pwi);
+ ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_free(dfp);
@@ -350,7 +360,7 @@ xfs_defer_finish_noroll(
struct list_head *n;
void *state;
int error = 0;
- void (*cleanup_fn)(struct xfs_trans *, void *, int);
+ const struct xfs_defer_op_type *ops;
LIST_HEAD(dop_pending);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -373,18 +383,18 @@ xfs_defer_finish_noroll(
/* Log an intent-done item for the first pending item. */
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
+ ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
- dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+ dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
dfp->dfp_count);
- cleanup_fn = dfp->dfp_type->finish_cleanup;
/* Finish the work items. */
state = NULL;
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
- error = dfp->dfp_type->finish_item(*tp, li,
- dfp->dfp_done, &state);
+ error = ops->finish_item(*tp, li, dfp->dfp_done,
+ &state);
if (error == -EAGAIN) {
/*
* Caller wants a fresh transaction;
@@ -400,8 +410,8 @@ xfs_defer_finish_noroll(
* xfs_defer_cancel will take care of freeing
* all these lists and stuff.
*/
- if (cleanup_fn)
- cleanup_fn(*tp, state, error);
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(*tp, state, error);
goto out;
}
}
@@ -413,20 +423,19 @@ xfs_defer_finish_noroll(
* a Fresh Transaction while Finishing
* Deferred Work" above.
*/
- dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+ dfp->dfp_intent = ops->create_intent(*tp,
dfp->dfp_count);
dfp->dfp_done = NULL;
list_for_each(li, &dfp->dfp_work)
- dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
- li);
+ ops->log_item(*tp, dfp->dfp_intent, li);
} else {
/* Done with the dfp, free it. */
list_del(&dfp->dfp_list);
kmem_free(dfp);
}
- if (cleanup_fn)
- cleanup_fn(*tp, state, error);
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(*tp, state, error);
}
out:
@@ -486,8 +495,10 @@ xfs_defer_add(
struct list_head *li)
{
struct xfs_defer_pending *dfp = NULL;
+ const struct xfs_defer_op_type *ops;
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
/*
* Add the item to a pending item at the end of the intake list.
@@ -497,15 +508,15 @@ xfs_defer_add(
if (!list_empty(&tp->t_dfops)) {
dfp = list_last_entry(&tp->t_dfops,
struct xfs_defer_pending, dfp_list);
- if (dfp->dfp_type->type != type ||
- (dfp->dfp_type->max_items &&
- dfp->dfp_count >= dfp->dfp_type->max_items))
+ ops = defer_op_types[dfp->dfp_type];
+ if (dfp->dfp_type != type ||
+ (ops->max_items && dfp->dfp_count >= ops->max_items))
dfp = NULL;
}
if (!dfp) {
dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
KM_SLEEP | KM_NOFS);
- dfp->dfp_type = defer_op_types[type];
+ dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
dfp->dfp_count = 0;
@@ -517,14 +528,6 @@ xfs_defer_add(
dfp->dfp_count++;
}
-/* Initialize a deferred operation list. */
-void
-xfs_defer_init_op_type(
- const struct xfs_defer_op_type *type)
-{
- defer_op_types[type->type] = type;
-}
-
/*
* Move deferred ops from one transaction to another and reset the source to
* initial state. This is primarily used to carry state forward across
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 2584a5b95b0d..7c28d7608ac6 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -9,20 +9,6 @@
struct xfs_defer_op_type;
/*
- * Save a log intent item and a list of extents, so that we can replay
- * whatever action had to happen to the extent list and file the log done
- * item.
- */
-struct xfs_defer_pending {
- const struct xfs_defer_op_type *dfp_type; /* function pointers */
- struct list_head dfp_list; /* pending items */
- void *dfp_intent; /* log intent item */
- void *dfp_done; /* log done item */
- struct list_head dfp_work; /* work items */
- unsigned int dfp_count; /* # extent items */
-};
-
-/*
* Header for deferred operation list.
*/
enum xfs_defer_ops_type {
@@ -34,6 +20,20 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_MAX,
};
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+ struct list_head dfp_list; /* pending items */
+ struct list_head dfp_work; /* work items */
+ void *dfp_intent; /* log intent item */
+ void *dfp_done; /* log done item */
+ unsigned int dfp_count; /* # extent items */
+ enum xfs_defer_ops_type dfp_type;
+};
+
void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
struct list_head *h);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
@@ -43,8 +43,6 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
- enum xfs_defer_ops_type type;
- unsigned int max_items;
void (*abort_intent)(void *);
void *(*create_done)(struct xfs_trans *, void *, unsigned int);
int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
@@ -54,8 +52,13 @@ struct xfs_defer_op_type {
int (*diff_items)(void *, struct list_head *, struct list_head *);
void *(*create_intent)(struct xfs_trans *, uint);
void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+ unsigned int max_items;
};
-void xfs_defer_init_op_type(const struct xfs_defer_op_type *type);
+extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
+extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
#endif /* __XFS_DEFER_H__ */
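With xfs_defer_init_op_type() removed, deferred-op providers no longer register themselves at runtime: each exports a const ops table, and the core binds them through the static defer_op_types[] array indexed by enum xfs_defer_ops_type. A hedged sketch of one provider definition under the new scheme (the callback and constant names below are assumed for illustration; the real tables live in the xfs_trans_*.c files):

	const struct xfs_defer_op_type xfs_extent_free_defer_type = {
		.create_intent	= xfs_extent_free_create_intent, /* assumed */
		.abort_intent	= xfs_extent_free_abort_intent,	 /* assumed */
		.log_item	= xfs_extent_free_log_item,	 /* assumed */
		.create_done	= xfs_extent_free_create_done,	 /* assumed */
		.finish_item	= xfs_extent_free_finish_item,	 /* assumed */
		.cancel_item	= xfs_extent_free_cancel_item,	 /* assumed */
		.diff_items	= xfs_extent_free_diff_items,	 /* assumed */
		.max_items	= XFS_EFI_MAX_FAST_EXTENTS,	 /* assumed */
	};

The BUILD_BUG_ON() added to xfs_defer_add() then catches, at compile time, any enum value that lacks a slot in defer_op_types[].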
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152cd1a24..156ce95c9c45 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b813d8..f54244779492 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
#endif /* __XFS_DIR2_H__ */
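xfs_dir2_namecheck() gives the directory code one predicate for untrusted on-disk names: names of MAXNAMELEN bytes or longer, or containing a '/' or a NUL, are rejected. A hedged example of a caller-side check while walking directory entries (dep follows struct xfs_dir2_data_entry; where exactly the checks land is up to later call sites):

	/* Reject a corrupt entry before trusting its name. */
	if (!xfs_dir2_namecheck(dep->name, dep->namelen))
		return -EFSCORRUPTED;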
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed5919da72..b7d6d78f4ce2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
.verify_struct = xfs_dir3_block_verify,
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c62ec8f..b7b9ce002cb9 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
.name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e6f5cf..9a3767818c50 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_failaddr_t fa;
- ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- uint16_t magic3;
-
- magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
- : XFS_DIR3_LEAFN_MAGIC;
-
- if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
- return __this_address;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
- return __this_address;
- } else {
- if (leaf->hdr.info.magic != cpu_to_be16(magic))
- return __this_address;
- }
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
-__read_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
xfs_failaddr_t fa;
@@ -185,23 +166,22 @@ __read_verify(
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
static void
-__write_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -216,60 +196,22 @@ __write_verify(
xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
.name = "xfs_dir3_leaf1",
- .verify_read = xfs_dir3_leaf1_read_verify,
- .verify_write = xfs_dir3_leaf1_write_verify,
- .verify_struct = xfs_dir3_leaf1_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.name = "xfs_dir3_leafn",
- .verify_read = xfs_dir3_leafn_read_verify,
- .verify_write = xfs_dir3_leafn_write_verify,
- .verify_struct = xfs_dir3_leafn_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
int
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb3434f51c..3b03703c5c3d 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
- return __this_address;
}
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
.name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
.verify_struct = xfs_dir3_free_verify,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f371dd54..fb5bd9a804f6 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
const struct xfs_buf_ops xfs_dquot_buf_ops = {
.name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
-#define XFS_ERRTAG_MAX 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9995d5ae380b..9bb3c48843ec 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -916,6 +916,9 @@ static inline uint xfs_dinode_size(int version)
/*
* Values for di_format
+ *
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
*/
typedef enum xfs_dinode_fmt {
XFS_DINODE_FMT_DEV, /* xfs_dev_t */
@@ -925,6 +928,13 @@ typedef enum xfs_dinode_fmt {
XFS_DINODE_FMT_UUID /* added long ago, but never used */
} xfs_dinode_fmt_t;
+#define XFS_INODE_FORMAT_STR \
+ { XFS_DINODE_FMT_DEV, "dev" }, \
+ { XFS_DINODE_FMT_LOCAL, "local" }, \
+ { XFS_DINODE_FMT_EXTENTS, "extent" }, \
+ { XFS_DINODE_FMT_BTREE, "btree" }, \
+ { XFS_DINODE_FMT_UUID, "uuid" }
+
/*
* Inode minimum and maximum sizes.
*/
@@ -1083,6 +1093,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+#define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
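The two new macros are pure shift arithmetic: XFS_INO_OFFSET_BITS() is sb_inopblog, the log2 of inodes per block, so converting a block number to the first inode number inside it is a left shift. A worked example under an assumed geometry:

	/*
	 * Assumed geometry: 4096-byte blocks and 512-byte inodes, so
	 * 8 inodes per block and XFS_INO_OFFSET_BITS(mp) == 3.
	 */
	xfs_agino_t	agino = XFS_AGB_TO_AGINO(mp, 100);  /* 100 << 3 == 800 */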
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a8f6db735d5d..fe9898875097 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -288,7 +288,7 @@ xfs_ialloc_inode_init(
{
struct xfs_buf *fbuf;
struct xfs_dinode *free;
- int nbufs, blks_per_cluster, inodes_per_cluster;
+ int nbufs;
int version;
int i, j;
xfs_daddr_t d;
@@ -299,9 +299,7 @@ xfs_ialloc_inode_init(
* sizes, manipulate the inodes in buffers which are multiples of the
* blocks size.
*/
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
- nbufs = length / blks_per_cluster;
+ nbufs = length / mp->m_blocks_per_cluster;
/*
* Figure out what version number to use in the inodes we create. If
@@ -312,7 +310,7 @@ xfs_ialloc_inode_init(
*
* For v3 inodes, we also need to write the inode number into the inode,
* so calculate the first inode number of the chunk here as
- * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+ * XFS_AGB_TO_AGINO() only works within a filesystem block, not
* across multiple filesystem blocks (such as a cluster) and so cannot
* be used in the cluster buffer loop below.
*
@@ -324,8 +322,7 @@ xfs_ialloc_inode_init(
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
version = 3;
- ino = XFS_AGINO_TO_INO(mp, agno,
- XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+ ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
/*
* log the initialisation that is about to take place as an
@@ -345,9 +342,10 @@ xfs_ialloc_inode_init(
/*
* Get the block.
*/
- d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno +
+ (j * mp->m_blocks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * blks_per_cluster,
+ mp->m_bsize * mp->m_blocks_per_cluster,
XBF_UNMAPPED);
if (!fbuf)
return -ENOMEM;
@@ -355,7 +353,7 @@ xfs_ialloc_inode_init(
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
- for (i = 0; i < inodes_per_cluster; i++) {
+ for (i = 0; i < mp->m_inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
uint isize = xfs_dinode_size(version);
@@ -445,7 +443,7 @@ xfs_align_sparse_ino(
return;
/* calculate the inode offset and align startino */
- offset = mod << mp->m_sb.sb_inopblog;
+ offset = XFS_AGB_TO_AGINO(mp, mod);
*startino -= offset;
/*
@@ -641,7 +639,7 @@ xfs_ialloc_ag_alloc(
args.tp = tp;
args.mp = tp->t_mountp;
args.fsbno = NULLFSBLOCK;
- xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
+ args.oinfo = XFS_RMAP_OINFO_INODES;
#ifdef DEBUG
/* randomly do sparse inode allocations */
@@ -692,7 +690,7 @@ xfs_ialloc_ag_alloc(
* but not to use them in the actual exact allocation.
*/
args.alignment = 1;
- args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
+ args.minalignslop = args.mp->m_cluster_align - 1;
/* Allow space for the inode btree to split. */
args.minleft = args.mp->m_in_maxlevels - 1;
@@ -727,7 +725,7 @@ xfs_ialloc_ag_alloc(
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
- args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+ args.alignment = args.mp->m_cluster_align;
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
@@ -756,7 +754,7 @@ xfs_ialloc_ag_alloc(
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
- args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+ args.alignment = args.mp->m_cluster_align;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -797,7 +795,7 @@ sparse_alloc:
if (error)
return error;
- newlen = args.len << args.mp->m_sb.sb_inopblog;
+ newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
ASSERT(newlen <= XFS_INODES_PER_CHUNK);
allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
}
@@ -825,7 +823,7 @@ sparse_alloc:
/*
* Convert the results.
*/
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+ newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
if (xfs_inobt_issparse(~allocmask)) {
/*
@@ -1019,7 +1017,7 @@ xfs_ialloc_ag_select(
*/
ineed = mp->m_ialloc_min_blks;
if (flags && ineed > 1)
- ineed += xfs_ialloc_cluster_alignment(mp);
+ ineed += mp->m_cluster_align;
longest = pag->pagf_longest;
if (!longest)
longest = pag->pagf_flcount > 0;
@@ -1849,14 +1847,12 @@ xfs_difree_inode_chunk(
int nextbit;
xfs_agblock_t agbno;
int contigblk;
- struct xfs_owner_info oinfo;
DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
- mp->m_ialloc_blks, &oinfo);
+ mp->m_ialloc_blks, &XFS_RMAP_OINFO_INODES);
return;
}
@@ -1900,7 +1896,7 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
- contigblk, &oinfo);
+ contigblk, &XFS_RMAP_OINFO_INODES);
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -2292,7 +2288,6 @@ xfs_imap(
xfs_agblock_t agbno; /* block number of inode in the alloc group */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agnumber_t agno; /* allocation group number */
- int blks_per_cluster; /* num blocks per inode cluster */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
xfs_agblock_t cluster_agbno; /* first block in inode cluster */
int error; /* error code */
@@ -2338,8 +2333,6 @@ xfs_imap(
return -EINVAL;
}
- blks_per_cluster = xfs_icluster_size_fsb(mp);
-
/*
* For bulkstat and handle lookups, we have an untrusted inode number
* that we have to verify is valid. We cannot do this just by reading
@@ -2359,7 +2352,7 @@ xfs_imap(
* If the inode cluster size is the same as the blocksize or
* smaller we get to the buffer by simple arithmetics.
*/
- if (blks_per_cluster == 1) {
+ if (mp->m_blocks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
@@ -2388,12 +2381,13 @@ xfs_imap(
out_map:
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
- ((offset_agbno / blks_per_cluster) * blks_per_cluster);
+ ((offset_agbno / mp->m_blocks_per_cluster) *
+ mp->m_blocks_per_cluster);
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
- imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
/*
@@ -2514,7 +2508,7 @@ xfs_agi_verify(
/*
* Validate the magic number of the agi block.
*/
- if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
return __this_address;
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return __this_address;
@@ -2588,6 +2582,7 @@ xfs_agi_write_verify(
const struct xfs_buf_ops xfs_agi_buf_ops = {
.name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
.verify_struct = xfs_agi_verify,
@@ -2726,8 +2721,8 @@ xfs_ialloc_has_inodes_at_extent(
xfs_agino_t low;
xfs_agino_t high;
- low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0);
- high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1;
+ low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+ high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
return xfs_ialloc_has_inode_record(cur, low, high, exists);
}
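The xfs_ialloc.c hunks above swap per-call recomputation of inode cluster geometry for fields cached on struct xfs_mount. The removed lines imply the one-time setup; a sketch of what the mount path now does (placement is assumed, the expressions come straight from the deleted code):

	mp->m_blocks_per_cluster = xfs_icluster_size_fsb(mp);
	mp->m_inodes_per_cluster = mp->m_blocks_per_cluster <<
				   mp->m_sb.sb_inopblog;
	mp->m_cluster_align = xfs_ialloc_cluster_alignment(mp);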
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 86c50208a143..1080381ff243 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -84,7 +84,7 @@ __xfs_inobt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT);
+ args.oinfo = XFS_RMAP_OINFO_INOBT;
args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
@@ -136,12 +136,9 @@ __xfs_inobt_free_block(
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
- struct xfs_owner_info oinfo;
-
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
- &oinfo, resv);
+ &XFS_RMAP_OINFO_INOBT, resv);
}
STATIC int
@@ -157,7 +154,7 @@ xfs_finobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_free_block(cur, bp);
return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}
@@ -263,6 +260,9 @@ xfs_inobt_verify(
xfs_failaddr_t fa;
unsigned int level;
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
/*
* During growfs operations, we can't verify the exact owner as the
* perag is not fully initialised and hence not attached to the buffer.
@@ -273,18 +273,10 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_IBT_CRC_MAGIC):
- case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_IBT_MAGIC):
- case cpu_to_be32(XFS_FIBT_MAGIC):
- break;
- default:
- return __this_address;
}
/* level verification */
@@ -331,6 +323,16 @@ xfs_inobt_write_verify(
const struct xfs_buf_ops xfs_inobt_buf_ops = {
.name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
.verify_struct = xfs_inobt_verify,
@@ -392,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
- .buf_ops = &xfs_inobt_buf_ops,
+ .buf_ops = &xfs_finobt_buf_ops,
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
@@ -538,15 +540,18 @@ xfs_inobt_rec_check_count(
static xfs_extlen_t
xfs_inobt_max_size(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
{
+ xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
+
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (mp->m_inobt_mxr[0] == 0)
return 0;
return xfs_btree_calc_size(mp->m_inobt_mnr,
- (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
- XFS_INODES_PER_CHUNK);
+ (uint64_t)agblocks * mp->m_sb.sb_inopblock /
+ XFS_INODES_PER_CHUNK);
}
static int
@@ -594,7 +599,7 @@ xfs_finobt_calc_reserves(
if (error)
return error;
- *ask += xfs_inobt_max_size(mp);
+ *ask += xfs_inobt_max_size(mp, agno);
*used += tree_len;
return 0;
}
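Passing the AG number into xfs_inobt_max_size() matters for the runt last AG: sizing every AG by sb_agblocks overstates the worst-case finobt for an AG that is shorter than the others. A worked example under assumed numbers:

	/*
	 * Assumed: sb_agblocks == 1048576 but the last AG holds only
	 * 262144 blocks; 16 inodes per block, 64 inodes per chunk.
	 * Old: every AG sized for 1048576 * 16 / 64 == 262144 records.
	 * New: the runt AG is sized for 262144 * 16 / 64 == 65536.
	 */
	*ask += xfs_inobt_max_size(mp, agno);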
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd072015d..bc690f2409fa 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
}
/*
- * Increment the sequence counter if we are on a COW fork. This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed. We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
*/
static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
{
- if (state & BMAP_COWFORK)
- WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
void
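Bumping if_seq on every extent tree change lets any consumer of cached fork state use a sample-and-recheck pattern; the WRITE_ONCE() on the update side pairs with READ_ONCE() on the read side. A hedged sketch of the reader (the in-tree consumer is the writeback path):

	unsigned int	seq;

	seq = READ_ONCE(ifp->if_seq);
	/* ... use a previously cached mapping for this fork ... */
	if (READ_ONCE(ifp->if_seq) != seq) {
		/* the extent tree changed underneath us; revalidate */
	}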
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8cfa4a0..e021d5133ccb 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- (unlinked_ino == NULLAGINO ||
- xfs_verify_agino(mp, agno, unlinked_ino));
+ xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
const struct xfs_buf_ops xfs_inode_buf_ops = {
.name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
- .name = "xxfs_inode_ra",
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2d74a1..00c62ce170d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@ struct xfs_dinode;
*/
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* cow fork mod counter */
+ unsigned int if_seq; /* fork mod counter */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 1aaa01c97517..6f47ab876d90 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -70,7 +70,7 @@ xfs_refcountbt_alloc_block(
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
xfs_refc_block(args.mp));
- xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+ args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
@@ -106,15 +106,13 @@ xfs_refcountbt_free_block(
struct xfs_buf *agbp = cur->bc_private.a.agbp;
struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
- struct xfs_owner_info oinfo;
int error;
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
- error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
XFS_AG_RESV_METADATA);
if (error)
return error;
@@ -211,7 +209,7 @@ xfs_refcountbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -266,6 +264,7 @@ xfs_refcountbt_write_verify(
const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
.verify_read = xfs_refcountbt_read_verify,
.verify_write = xfs_refcountbt_write_verify,
.verify_struct = xfs_refcountbt_verify,
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 245af452840e..8ed885507dd8 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -458,21 +458,21 @@ out:
*/
STATIC int
xfs_rmap_unmap(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- uint64_t ltoff;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
- bool ignore_off;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool ignore_off;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
@@ -653,16 +653,16 @@ out_error:
*/
int
xfs_rmap_free(
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *cur;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
@@ -710,23 +710,23 @@ xfs_rmap_is_mergeable(
*/
STATIC int
xfs_rmap_map(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- struct xfs_rmap_irec gtrec;
- int have_gt;
- int have_lt;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags = 0;
- bool ignore_off;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+ bool ignore_off;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
ASSERT(owner != 0);
@@ -890,16 +890,16 @@ out_error:
*/
int
xfs_rmap_alloc(
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *cur;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
@@ -929,16 +929,16 @@ xfs_rmap_alloc(
*/
STATIC int
xfs_rmap_convert(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec r[4]; /* neighbor extent entries */
- /* left is 0, right is 1, prev is 2 */
- /* new is 3 */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
uint64_t owner;
uint64_t offset;
uint64_t new_endoff;
@@ -1354,16 +1354,16 @@ done:
*/
STATIC int
xfs_rmap_convert_shared(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec r[4]; /* neighbor extent entries */
- /* left is 0, right is 1, prev is 2 */
- /* new is 3 */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
uint64_t owner;
uint64_t offset;
uint64_t new_endoff;
@@ -1743,20 +1743,20 @@ done:
*/
STATIC int
xfs_rmap_unmap_shared(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- uint64_t ltoff;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
@@ -1905,22 +1905,22 @@ out_error:
*/
STATIC int
xfs_rmap_map_shared(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- struct xfs_rmap_irec gtrec;
- int have_gt;
- int have_lt;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags = 0;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
@@ -2459,18 +2459,18 @@ xfs_rmap_has_record(
*/
int
xfs_rmap_record_exists(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
- bool *has_rmap)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool *has_rmap)
{
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
- int has_record;
- struct xfs_rmap_irec irec;
- int error;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ int has_record;
+ struct xfs_rmap_irec irec;
+ int error;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
@@ -2530,7 +2530,7 @@ xfs_rmap_has_other_keys(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
bool *has_rmap)
{
struct xfs_rmap_irec low = {0};
@@ -2550,3 +2550,31 @@ xfs_rmap_has_other_keys(
*has_rmap = rks.has_rmap;
return error;
}
+
+const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
+ .oi_owner = XFS_RMAP_OWN_NULL,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = {
+ .oi_owner = XFS_RMAP_OWN_UNKNOWN,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_FS = {
+ .oi_owner = XFS_RMAP_OWN_FS,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_LOG = {
+ .oi_owner = XFS_RMAP_OWN_LOG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_AG = {
+ .oi_owner = XFS_RMAP_OWN_AG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = {
+ .oi_owner = XFS_RMAP_OWN_INOBT,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INODES = {
+ .oi_owner = XFS_RMAP_OWN_INODES,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
+ .oi_owner = XFS_RMAP_OWN_REFC,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
+ .oi_owner = XFS_RMAP_OWN_COW,
+};
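With these shared const singletons, callers drop the stack-local struct xfs_owner_info and its initializer helper and simply pass a pointer to the right object, which is also why every oinfo parameter in this patch gains const. The pattern as it already appears in the xfs_ialloc.c hunk above:

	xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
			  contigblk, &XFS_RMAP_OINFO_INODES);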
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 157dc722ad35..e21ed0294e5c 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -7,16 +7,6 @@
#define __XFS_RMAP_H__
static inline void
-xfs_rmap_ag_owner(
- struct xfs_owner_info *oi,
- uint64_t owner)
-{
- oi->oi_owner = owner;
- oi->oi_offset = 0;
- oi->oi_flags = 0;
-}
-
-static inline void
xfs_rmap_ino_bmbt_owner(
struct xfs_owner_info *oi,
xfs_ino_t ino,
@@ -43,27 +33,13 @@ xfs_rmap_ino_owner(
oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
}
-static inline void
-xfs_rmap_skip_owner_update(
- struct xfs_owner_info *oi)
-{
- xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL);
-}
-
static inline bool
xfs_rmap_should_skip_owner_update(
- struct xfs_owner_info *oi)
+ const struct xfs_owner_info *oi)
{
return oi->oi_owner == XFS_RMAP_OWN_NULL;
}
-static inline void
-xfs_rmap_any_owner_update(
- struct xfs_owner_info *oi)
-{
- xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN);
-}
-
/* Reverse mapping functions. */
struct xfs_buf;
@@ -103,12 +79,12 @@ xfs_rmap_irec_offset_unpack(
static inline void
xfs_owner_info_unpack(
- struct xfs_owner_info *oinfo,
- uint64_t *owner,
- uint64_t *offset,
- unsigned int *flags)
+ const struct xfs_owner_info *oinfo,
+ uint64_t *owner,
+ uint64_t *offset,
+ unsigned int *flags)
{
- unsigned int r = 0;
+ unsigned int r = 0;
*owner = oinfo->oi_owner;
*offset = oinfo->oi_offset;
@@ -137,10 +113,10 @@ xfs_owner_info_pack(
int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
- struct xfs_owner_info *oinfo);
+ const struct xfs_owner_info *oinfo);
int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
- struct xfs_owner_info *oinfo);
+ const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
@@ -218,11 +194,21 @@ int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec,
int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, bool *exists);
int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
bool *has_rmap);
int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
bool *has_rmap);
int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
+extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_FS;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_AG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf040d745..5738e11055e6 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
* from the on disk AGF. Again, we can only check against maximum limits
* in this case.
*/
- if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
.verify_read = xfs_rmapbt_read_verify,
.verify_write = xfs_rmapbt_write_verify,
.verify_struct = xfs_rmapbt_verify,
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index b228c821bae6..eaaff67e9626 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -505,6 +505,12 @@ xfs_rtmodify_summary_int(
uint first = (uint)((char *)sp - (char *)bp->b_addr);
*sp += delta;
+ if (mp->m_rsum_cache) {
+ if (*sp == 0 && log == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno]++;
+ if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ }
xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
}
if (sum)
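The invariant the rsum_cache hunk maintains, stated informally: for each realtime bitmap block bbno, every summary count at a log level strictly below m_rsum_cache[bbno] is zero. A count at the cached level dropping to zero lets the bound advance by one (conservatively, since the next level up may be zero too); a nonzero count appearing below it pulls the bound down. A consumer can then skip provably empty levels; a hedged sketch with an assumed loop structure:

	int	log = mp->m_rsum_cache ? mp->m_rsum_cache[bbno] : 0;

	for (; log < mp->m_rsumlevels; log++) {
		/* scan summary counts at this level for a free extent */
	}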
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b5a82acd7dfe..77a3a4085de3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
uint32_t agcount = 0;
uint32_t rem;
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
}
@@ -781,12 +782,14 @@ out_error:
const struct xfs_buf_ops xfs_sb_buf_ops = {
.name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
uint64_t bfreelst = 0;
uint64_t btree = 0;
uint64_t fdblocks;
- int error;
+ int error = 0;
for (index = 0; index < agcount; index++) {
/*
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe748f0..4e909791aeac 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 95374ab2dee7..a0ccc253c43d 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -95,7 +95,7 @@ xfs_symlink_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
const struct xfs_buf_ops xfs_symlink_buf_ops = {
.name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
.verify_struct = xfs_symlink_verify,
@@ -199,7 +200,10 @@ xfs_symlink_local_to_remote(
ifp->if_bytes - 1);
}
-/* Verify the consistency of an inline symlink. */
+/*
+ * Verify the in-memory consistency of an inline symlink data fork. This
+ * does not do on-disk format checks.
+ */
xfs_failaddr_t
xfs_symlink_shortform_verify(
struct xfs_inode *ip)
@@ -215,9 +219,12 @@ xfs_symlink_shortform_verify(
size = ifp->if_bytes;
endp = sfp + size;
- /* Zero length symlinks can exist while we're deleting a remote one. */
- if (size == 0)
- return NULL;
+ /*
+ * Zero length symlinks should never occur in memory as they are
+	 * never allowed to exist on disk.
+ */
+ if (!size)
+ return __this_address;
/* No negative sizes or overly long symlink targets. */
if (size < 0 || size > XFS_SYMLINK_MAXLEN)
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 33a5ca346baf..de310712dd6d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -87,16 +87,15 @@ xfs_agino_range(
* Calculate the first inode, which will be in the first
* cluster-aligned block after the AGFL.
*/
- bno = round_up(XFS_AGFL_BLOCK(mp) + 1,
- xfs_ialloc_cluster_alignment(mp));
- *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0);
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, mp->m_cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
/*
* Calculate the last inode, which will be at the end of the
* last (aligned) cluster that can be allocated in the AG.
*/
- bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp));
- *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1;
+ bno = round_down(eoag, mp->m_cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
}
/*
@@ -117,6 +116,19 @@ xfs_verify_agino(
}
/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -205,3 +217,14 @@ xfs_verify_icount(
xfs_icount_range(mp, &min, &max);
return icount >= min && icount <= max;
}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b9e6c89284c3..c5a25403b4db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -100,15 +100,37 @@ typedef void * xfs_failaddr_t;
*/
#define MAXNAMELEN 256
+/*
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
typedef enum {
XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
} xfs_lookup_t;
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+ { XFS_LOOKUP_EQi, "eq" }, \
+ { XFS_LOOKUP_LEi, "le" }, \
+ { XFS_LOOKUP_GEi, "ge" }
+
+/*
+ * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
+ * please keep the TRACE_DEFINE_ENUMs for it up to date.
+ */
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
} xfs_btnum_t;
+#define XFS_BTNUM_STRINGS \
+ { XFS_BTNUM_BNOi, "bnobt" }, \
+ { XFS_BTNUM_CNTi, "cntbt" }, \
+ { XFS_BTNUM_RMAPi, "rmapbt" }, \
+ { XFS_BTNUM_BMAPi, "bmbt" }, \
+ { XFS_BTNUM_INOi, "inobt" }, \
+ { XFS_BTNUM_FINOi, "finobt" }, \
+ { XFS_BTNUM_REFCi, "refcbt" }
+
struct xfs_name {
const unsigned char *name;
int len;
@@ -161,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t *first, xfs_agino_t *last);
bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 3068a9382feb..ddf06bfaa29d 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -32,7 +32,6 @@ xchk_superblock_xref(
struct xfs_scrub *sc,
struct xfs_buf *bp)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
@@ -49,8 +48,7 @@ xchk_superblock_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
/* scrub teardown will take care of sc->sa for us */
@@ -401,7 +399,7 @@ xchk_agf_xref_cntbt(
if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
return;
if (!have) {
- if (agf->agf_freeblks != be32_to_cpu(0))
+ if (agf->agf_freeblks != cpu_to_be32(0))
xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
return;
}
@@ -484,7 +482,6 @@ STATIC void
xchk_agf_xref(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
int error;
@@ -502,8 +499,7 @@ xchk_agf_xref(
xchk_agf_xref_freeblks(sc);
xchk_agf_xref_cntbt(sc);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_agf_xref_btreeblks(sc);
xchk_xref_is_not_shared(sc, agbno, 1);
xchk_agf_xref_refcblks(sc);
@@ -598,7 +594,6 @@ out:
/* AGFL */
struct xchk_agfl_info {
- struct xfs_owner_info oinfo;
unsigned int sz_entries;
unsigned int nr_entries;
xfs_agblock_t *entries;
@@ -609,15 +604,14 @@ struct xchk_agfl_info {
STATIC void
xchk_agfl_block_xref(
struct xfs_scrub *sc,
- xfs_agblock_t agbno,
- struct xfs_owner_info *oinfo)
+ xfs_agblock_t agbno)
{
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xchk_xref_is_owned_by(sc, agbno, 1, oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
xchk_xref_is_not_shared(sc, agbno, 1);
}
@@ -638,7 +632,7 @@ xchk_agfl_block(
else
xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
- xchk_agfl_block_xref(sc, agbno, priv);
+ xchk_agfl_block_xref(sc, agbno);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return XFS_BTREE_QUERY_RANGE_ABORT;
@@ -662,7 +656,6 @@ STATIC void
xchk_agfl_xref(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
int error;
@@ -678,8 +671,7 @@ xchk_agfl_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
/*
@@ -732,7 +724,6 @@ xchk_agfl(
}
/* Check the blocks in the AGFL. */
- xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
sc->sa.agfl_bp, xchk_agfl_block, &sai);
if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
@@ -791,7 +782,6 @@ STATIC void
xchk_agi_xref(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
int error;
@@ -808,8 +798,7 @@ xchk_agi_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
xchk_agi_xref_icounts(sc);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
/* scrub teardown will take care of sc->sa for us */
@@ -875,19 +864,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (agino == NULLAGINO)
- continue;
- if (!xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
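A pattern repeated across these scrub hunks: the per-call, stack-initialized xfs_owner_info is replaced by shared constant instances (XFS_RMAP_OINFO_FS and friends), which in turn lets the consumers accept const pointers. A generic sketch of that refactoring, with hypothetical names:

#include <stdio.h>

struct owner_info {
	unsigned long long owner;	/* e.g. a special rmap owner code */
};

/* One shared, immutable instance per well-known owner... */
static const struct owner_info OINFO_FS = { .owner = 1 };

/* ...so consumers can take a const pointer instead of filling a local. */
static void check_owned_by(unsigned agbno, const struct owner_info *oinfo)
{
	printf("agbno %u expected owner %llu\n", agbno, oinfo->owner);
}

int main(void)
{
	check_owned_by(7, &OINFO_FS);	/* no per-call initialization dance */
	return 0;
}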
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index f7568a4b5fe5..64e31f87d490 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -341,23 +341,19 @@ xrep_agf(
struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTB_CRC_MAGIC,
+ .buf_ops = &xfs_bnobt_buf_ops,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTC_CRC_MAGIC,
+ .buf_ops = &xfs_cntbt_buf_ops,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
- .magic = XFS_RMAP_CRC_MAGIC,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
- .magic = XFS_REFC_CRC_MAGIC,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -646,7 +642,6 @@ int
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_bitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
@@ -708,8 +703,8 @@ xrep_agfl(
goto err;
/* Dump any AGFL overflow. */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
- return xrep_reap_extents(sc, &agfl_extents, &oinfo, XFS_AG_RESV_AGFL);
+ return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ XFS_AG_RESV_AGFL);
err:
xfs_bitmap_destroy(&agfl_extents);
return error;
@@ -876,12 +871,10 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_IBT_CRC_MAGIC,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_FIBT_CRC_MAGIC,
+ .buf_ops = &xfs_finobt_buf_ops,
},
[XREP_AGI_END] = {
.buf_ops = NULL
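Because each btree's buf_ops now carries its own magic numbers, the repair probe table can drop its separate .magic field and read the value through the ops pointer, as xrep_findroot_block does later in this series. A schematic model (simplified types; the magic values are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct buf_ops {
	const char *name;
	uint32_t magic[2];	/* [0] = non-CRC magic, [1] = CRC magic */
};

/* Illustrative values only; the real constants live in xfs_format.h. */
static const struct buf_ops bnobt_ops = {
	.name = "bnobt", .magic = { 0x11111111, 0x22222222 },
};

struct find_ag_btree {
	const struct buf_ops *buf_ops;	/* the separate .magic field is gone */
};

/* Match a candidate block against the btree we are probing for. */
static bool block_matches(uint32_t disk_magic, const struct find_ag_btree *fab)
{
	return disk_magic == fab->buf_ops->magic[1];
}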
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 376bcb585ae6..44883e9112ad 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -125,12 +125,10 @@ xchk_allocbt(
struct xfs_scrub *sc,
xfs_btnum_t which)
{
- struct xfs_owner_info oinfo;
struct xfs_btree_cur *cur;
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
- return xchk_btree(sc, cur, xchk_allocbt_rec, &oinfo, NULL);
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL);
}
int
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90547a1..dce74ec57038 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -82,12 +82,23 @@ xchk_xattr_listent(
sx = container_of(context, struct xchk_xattr, context);
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = 1;
+ return;
+ }
+
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
xchk_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
args.flags = ATTR_KERNOTIME;
if (flags & XFS_ATTR_ROOT)
args.flags |= ATTR_ROOT;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3223e3..a703cd58a90e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
xchk_ag_free(info->sc, &info->sc->sa);
}
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+ return;
+
+ if (!xfs_verify_dablk(mp, irec->br_startoff))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ off = irec->br_startoff + irec->br_blockcount - 1;
+ if (!xfs_verify_dablk(mp, off))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
/* Scrub a single extent record. */
STATIC int
xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 4ae959f7ad2c..6f94d1f7322d 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -583,31 +583,32 @@ xchk_btree_block_keys(
*/
int
xchk_btree(
- struct xfs_scrub *sc,
- struct xfs_btree_cur *cur,
- xchk_btree_rec_fn scrub_fn,
- struct xfs_owner_info *oinfo,
- void *private)
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ xchk_btree_rec_fn scrub_fn,
+ const struct xfs_owner_info *oinfo,
+ void *private)
{
- struct xchk_btree bs = { NULL };
- union xfs_btree_ptr ptr;
- union xfs_btree_ptr *pp;
- union xfs_btree_rec *recp;
- struct xfs_btree_block *block;
- int level;
- struct xfs_buf *bp;
- struct check_owner *co;
- struct check_owner *n;
- int i;
- int error = 0;
+ struct xchk_btree bs = {
+ .cur = cur,
+ .scrub_rec = scrub_fn,
+ .oinfo = oinfo,
+ .firstrec = true,
+ .private = private,
+ .sc = sc,
+ };
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int level;
+ struct xfs_buf *bp;
+ struct check_owner *co;
+ struct check_owner *n;
+ int i;
+ int error = 0;
/* Initialize scrub state */
- bs.cur = cur;
- bs.scrub_rec = scrub_fn;
- bs.oinfo = oinfo;
- bs.firstrec = true;
- bs.private = private;
- bs.sc = sc;
for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
bs.firstkey[i] = true;
INIT_LIST_HEAD(&bs.to_check);
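The rewrite above swaps a zeroed struct plus field-by-field assignment for a designated initializer, which zeroes every unnamed member at the point of declaration. The semantics in miniature:

#include <assert.h>
#include <stdbool.h>

struct scrub_state {
	void *cur;
	bool firstrec;
	int depth;	/* not named in the initializer below */
};

int main(void)
{
	int cookie;
	struct scrub_state bs = {
		.cur = &cookie,
		.firstrec = true,
	};
	assert(bs.depth == 0);	/* unnamed members start out zeroed */
	return 0;
}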
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index aada763cd006..5572e475f8ed 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -31,21 +31,21 @@ typedef int (*xchk_btree_rec_fn)(
struct xchk_btree {
/* caller-provided scrub state */
- struct xfs_scrub *sc;
- struct xfs_btree_cur *cur;
- xchk_btree_rec_fn scrub_rec;
- struct xfs_owner_info *oinfo;
- void *private;
+ struct xfs_scrub *sc;
+ struct xfs_btree_cur *cur;
+ xchk_btree_rec_fn scrub_rec;
+ const struct xfs_owner_info *oinfo;
+ void *private;
/* internal scrub state */
- union xfs_btree_rec lastrec;
- bool firstrec;
- union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
- bool firstkey[XFS_BTREE_MAXLEVELS];
- struct list_head to_check;
+ union xfs_btree_rec lastrec;
+ bool firstrec;
+ union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
+ bool firstkey[XFS_BTREE_MAXLEVELS];
+ struct list_head to_check;
};
int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
- xchk_btree_rec_fn scrub_fn, struct xfs_owner_info *oinfo,
+ xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
void *private);
#endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 346b02abccf7..0c54ff55b901 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -313,8 +313,8 @@ xchk_set_incomplete(
*/
struct xchk_rmap_ownedby_info {
- struct xfs_owner_info *oinfo;
- xfs_filblks_t *blocks;
+ const struct xfs_owner_info *oinfo;
+ xfs_filblks_t *blocks;
};
STATIC int
@@ -347,15 +347,15 @@ int
xchk_count_rmap_ownedby_ag(
struct xfs_scrub *sc,
struct xfs_btree_cur *cur,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
xfs_filblks_t *blocks)
{
- struct xchk_rmap_ownedby_info sroi;
+ struct xchk_rmap_ownedby_info sroi = {
+ .oinfo = oinfo,
+ .blocks = blocks,
+ };
- sroi.oinfo = oinfo;
*blocks = 0;
- sroi.blocks = blocks;
-
return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
&sroi);
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 2d4324d12f9a..e26a430bd466 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -116,7 +116,7 @@ int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
void xchk_ag_btcur_free(struct xchk_ag *sa);
int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
- struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
+ const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip,
bool force_log);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d768a18..a38a22785a1a 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -129,6 +129,12 @@ xchk_dir_actor(
goto out;
}
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
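Both the directory and extended-attribute scrubbers now reject malformed names before acting on them. A userspace model of such a namecheck; the exact rules enforced by xfs_dir2_namecheck and xfs_attr_namecheck live in libxfs, so the byte-level tests below are assumptions of this sketch:

#include <stdbool.h>
#include <stddef.h>

#define MODEL_MAXNAMELEN 255	/* stand-in for the on-disk name limit */

/*
 * Model of a directory-entry namecheck: reject empty or oversized names
 * and any name containing a NUL or a path separator.
 */
static bool model_dir_namecheck(const unsigned char *name, size_t len)
{
	if (len == 0 || len > MODEL_MAXNAMELEN)
		return false;
	for (size_t i = 0; i < len; i++)
		if (name[i] == '\0' || name[i] == '/')
			return false;
	return true;
}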
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 224dba937492..700114f79a7d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -44,6 +44,17 @@ xchk_setup_ag_iallocbt(
/* Inode btree scrubber. */
+struct xchk_iallocbt {
+ /* Number of inodes we see while scanning inobt. */
+ unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
+};
+
/*
* If we're checking the finobt, cross-reference with the inobt.
* Otherwise we're checking the inobt; if there is an finobt, make sure
@@ -82,15 +93,12 @@ xchk_iallocbt_chunk_xref(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- struct xfs_owner_info oinfo;
-
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
xchk_xref_is_used_space(sc, agbno, len);
xchk_iallocbt_chunk_xref_other(sc, irec, agino);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
- xchk_xref_is_owned_by(sc, agbno, len, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
xchk_xref_is_not_shared(sc, agbno, len);
}
@@ -126,41 +134,57 @@ xchk_iallocbt_freecount(
return hweight64(freemask);
}
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
struct xchk_btree *bs,
- xfs_ino_t fsino,
- xfs_agino_t chunkino,
- xfs_agino_t clusterino,
struct xfs_inobt_rec_incore *irec,
- struct xfs_buf *bp)
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
{
- struct xfs_dinode *dip;
struct xfs_mount *mp = bs->cur->bc_mp;
- bool inode_is_free = false;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
bool freemask_ok;
- bool inuse;
int error = 0;
if (xchk_should_terminate(bs->sc, &error))
return error;
- dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- (dip->di_version >= 3 &&
- be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
- if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
- inode_is_free = true;
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
- fsino + clusterino, &inuse);
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
- freemask_ok = inode_is_free ^ !!(dip->di_mode);
+ freemask_ok = irec_free ^ !!(dip->di_mode);
if (!bs->sc->try_harder && !freemask_ok)
return -EDEADLOCK;
} else if (error < 0) {
@@ -172,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
goto out;
} else {
/* Inode is all there. */
- freemask_ok = inode_is_free ^ inuse;
+ freemask_ok = irec_free ^ ino_inuse;
}
if (!freemask_ok)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -180,89 +204,221 @@ out:
return 0;
}
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
struct xchk_btree *bs,
- struct xfs_inobt_rec_incore *irec)
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
{
- struct xfs_owner_info oinfo;
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_dinode *dip;
- struct xfs_buf *bp;
- xfs_ino_t fsino;
- xfs_agino_t nr_inodes;
- xfs_agino_t agino;
- xfs_agino_t chunkino;
- xfs_agino_t clusterino;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t agbno;
- int blks_per_cluster;
- uint16_t holemask;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
uint16_t ir_holemask;
int error = 0;
- /* Make sure the freemask matches the inode records. */
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
-
- for (agino = irec->ir_startino;
- agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
- agino += blks_per_cluster * mp->m_sb.sb_inopblock) {
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
- chunkino = agino - irec->ir_startino;
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
- /* Compute the holemask mask for this cluster. */
- for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
- clusterino += XFS_INODES_PER_HOLEMASK_BIT)
- holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
- XFS_INODES_PER_HOLEMASK_BIT);
-
- /* The whole cluster must be a hole or not a hole. */
- ir_holemask = (irec->ir_holemask & holemask);
- if (ir_holemask != holemask && ir_holemask != 0) {
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_inodes_per_cluster);
+
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be either entirely a hole or entirely present. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+ 0, 0);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- continue;
+ break;
}
- /* If any part of this is a hole, skip it. */
- if (ir_holemask) {
- xchk_xref_is_not_owned_by(bs->sc, agbno,
- blks_per_cluster, &oinfo);
- continue;
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
+
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- xchk_xref_is_owned_by(bs->sc, agbno, blks_per_cluster,
- &oinfo);
-
- /* Grab the inode cluster buffer. */
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
- agbno);
- imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
- imap.im_boffset = 0;
-
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
- &dip, &bp, 0, 0);
- if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
- &error))
- continue;
-
- /* Which inodes are free? */
- for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
- error = xchk_iallocbt_check_cluster_freemask(bs,
- fsino, chunkino, clusterino, irec, bp);
- if (error) {
- xfs_trans_brelse(bs->cur->bc_tp, bp);
- return error;
- }
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
+
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
}
+ return;
+ }
+
+ /* inobt records must be aligned to the cluster and inode alignment sizes. */
+ if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
- xfs_trans_brelse(bs->cur->bc_tp, bp);
+ if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- return error;
+ if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
}
/* Scrub an inobt/finobt record. */
@@ -272,12 +428,11 @@ xchk_iallocbt_rec(
union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_filblks_t *inode_blocks = bs->private;
+ struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
- xfs_agblock_t agbno;
xfs_extlen_t len;
int holecount;
int i;
@@ -304,14 +459,11 @@ xchk_iallocbt_rec(
goto out;
}
- /* Make sure this record is aligned to cluster and inoalignmnt size. */
- agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
- if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
- (agbno & (xfs_icluster_size_fsb(mp) - 1)))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
- *inode_blocks += XFS_B_TO_FSB(mp,
- irec.ir_count * mp->m_sb.sb_inodesize);
+ iabt->inodes += irec.ir_count;
/* Handle non-sparse inodes */
if (!xfs_inobt_issparse(irec.ir_holemask)) {
@@ -322,7 +474,7 @@ xchk_iallocbt_rec(
if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
goto out;
- goto check_freemask;
+ goto check_clusters;
}
/* Check each chunk of a sparse inode cluster. */
@@ -348,8 +500,8 @@ xchk_iallocbt_rec(
holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-check_freemask:
- error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -366,7 +518,6 @@ xchk_iallocbt_xref_rmap_btreeblks(
struct xfs_scrub *sc,
int which)
{
- struct xfs_owner_info oinfo;
xfs_filblks_t blocks;
xfs_extlen_t inobt_blocks = 0;
xfs_extlen_t finobt_blocks = 0;
@@ -388,9 +539,8 @@ xchk_iallocbt_xref_rmap_btreeblks(
return;
}
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_INOBT, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != inobt_blocks + finobt_blocks)
@@ -405,21 +555,21 @@ STATIC void
xchk_iallocbt_xref_rmap_inodes(
struct xfs_scrub *sc,
int which,
- xfs_filblks_t inode_blocks)
+ unsigned long long inodes)
{
- struct xfs_owner_info oinfo;
xfs_filblks_t blocks;
+ xfs_filblks_t inode_blocks;
int error;
if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
return;
/* Check that we saw as many inode blocks as the rmap knows about. */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_INODES, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
+ inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize);
if (blocks != inode_blocks)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
@@ -431,14 +581,16 @@ xchk_iallocbt(
xfs_btnum_t which)
{
struct xfs_btree_cur *cur;
- struct xfs_owner_info oinfo;
- xfs_filblks_t inode_blocks = 0;
+ struct xchk_iallocbt iabt = {
+ .inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
+ };
int error;
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
- error = xchk_btree(sc, cur, xchk_iallocbt_rec, &oinfo,
- &inode_blocks);
+ error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
+ &iabt);
if (error)
return error;
@@ -452,7 +604,7 @@ xchk_iallocbt(
* to inode chunks with free inodes.
*/
if (which == XFS_BTNUM_INO)
- xchk_iallocbt_xref_rmap_inodes(sc, which, inode_blocks);
+ xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
return error;
}
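The rewritten scrubber steps through each inobt record in cluster_base increments of m_inodes_per_cluster and derives a holemask-granularity bitmask per cluster. A standalone worked example of that arithmetic, assuming 64-inode chunks, 4 inodes per holemask bit, and a hypothetical 16-inode cluster:

#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK	64
#define INODES_PER_HOLEMASK_BIT	4

int main(void)
{
	/* Suppose this filesystem packs 16 inodes into one cluster buffer. */
	unsigned int inodes_per_cluster = 16;

	for (unsigned int cluster_base = 0;
	     cluster_base < INODES_PER_CHUNK;
	     cluster_base += inodes_per_cluster) {
		uint16_t cluster_mask = 0;

		/* One holemask bit covers each 4-inode span of the cluster. */
		for (unsigned int i = 0; i < inodes_per_cluster;
		     i += INODES_PER_HOLEMASK_BIT)
			cluster_mask |= (uint16_t)1 <<
				((cluster_base + i) / INODES_PER_HOLEMASK_BIT);

		printf("cluster_base %2u mask 0x%04x\n",
		       cluster_base, cluster_mask);
	}
	/* Prints masks 0x000f, 0x00f0, 0x0f00, 0xf000: one per cluster. */
	return 0;
}

When a cluster holds more inodes than a chunk, the outer loop runs once and the next_startino/next_cluster_ino bookkeeping above takes over instead.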
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index e386c9b0b4ab..e213efc194a1 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -509,7 +509,6 @@ xchk_inode_xref(
xfs_ino_t ino,
struct xfs_dinode *dip)
{
- struct xfs_owner_info oinfo;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
int error;
@@ -526,8 +525,7 @@ xchk_inode_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_inode_xref_finobt(sc, ino);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
xchk_xref_is_not_shared(sc, agbno, 1);
xchk_inode_xref_bmap(sc, dip);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index e8c82b026083..708b4158eb90 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -383,7 +383,6 @@ xchk_refcountbt_rec(
STATIC void
xchk_refcount_xref_rmap(
struct xfs_scrub *sc,
- struct xfs_owner_info *oinfo,
xfs_filblks_t cow_blocks)
{
xfs_extlen_t refcbt_blocks = 0;
@@ -397,17 +396,16 @@ xchk_refcount_xref_rmap(
error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks);
if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error))
return;
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_REFC, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != refcbt_blocks)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
/* Check that we saw as many cow blocks as the rmap knows about. */
- xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW);
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_COW, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != cow_blocks)
@@ -419,17 +417,15 @@ int
xchk_refcountbt(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
xfs_agblock_t cow_blocks = 0;
int error;
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
- &oinfo, &cow_blocks);
+ &XFS_RMAP_OINFO_REFC, &cow_blocks);
if (error)
return error;
- xchk_refcount_xref_rmap(sc, &oinfo, cow_blocks);
+ xchk_refcount_xref_rmap(sc, cow_blocks);
return 0;
}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4fc0a5ea7673..f28f4bad317b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -299,14 +299,14 @@ xrep_calc_ag_resblks(
/* Allocate a block in an AG. */
int
xrep_alloc_ag_block(
- struct xfs_scrub *sc,
- struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv)
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv)
{
- struct xfs_alloc_arg args = {0};
- xfs_agblock_t bno;
- int error;
+ struct xfs_alloc_arg args = {0};
+ xfs_agblock_t bno;
+ int error;
switch (resv) {
case XFS_AG_RESV_AGFL:
@@ -505,7 +505,6 @@ xrep_put_freelist(
struct xfs_scrub *sc,
xfs_agblock_t agbno)
{
- struct xfs_owner_info oinfo;
int error;
/* Make sure there's space on the freelist. */
@@ -518,9 +517,8 @@ xrep_put_freelist(
* create an rmap for the block prior to merging it or else other
* parts will break.
*/
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
- &oinfo);
+ &XFS_RMAP_OINFO_AG);
if (error)
return error;
@@ -538,17 +536,17 @@ xrep_put_freelist(
/* Dispose of a single block. */
STATIC int
xrep_reap_block(
- struct xfs_scrub *sc,
- xfs_fsblock_t fsbno,
- struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type resv)
+ struct xfs_scrub *sc,
+ xfs_fsblock_t fsbno,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
{
- struct xfs_btree_cur *cur;
- struct xfs_buf *agf_bp = NULL;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- bool has_other_rmap;
- int error;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
@@ -612,15 +610,15 @@ out_free:
/* Dispose of every block of every extent in the bitmap. */
int
xrep_reap_extents(
- struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap,
- struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
+ struct xfs_scrub *sc,
+ struct xfs_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
- xfs_fsblock_t fsbno;
- int error = 0;
+ struct xfs_bitmap_range *bmr;
+ struct xfs_bitmap_range *n;
+ xfs_fsblock_t fsbno;
+ int error = 0;
ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
@@ -745,7 +743,8 @@ xrep_findroot_block(
/* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
- if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
goto out;
/*
@@ -770,18 +769,23 @@ xrep_findroot_block(
if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
&mp->m_sb.sb_meta_uuid))
goto out;
+ /*
+ * Read verifiers can reference b_ops, so we set the pointer
+ * here. If the verifier fails we'll reset the buffer state
+ * to what it was before we touched the buffer.
+ */
+ bp->b_ops = fab->buf_ops;
fab->buf_ops->verify_read(bp);
if (bp->b_error) {
+ bp->b_ops = NULL;
bp->b_error = 0;
goto out;
}
/*
* Some read verifiers will (re)set b_ops, so we must be
- * careful not to blow away any such assignment.
+ * careful not to change b_ops after running the verifier.
*/
- if (!bp->b_ops)
- bp->b_ops = fab->buf_ops;
}
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9de321eee4ab..d990314eb08b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -21,8 +21,9 @@ int xrep_roll_ag_trans(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
-int xrep_alloc_ag_block(struct xfs_scrub *sc, struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno, enum xfs_ag_resv_type resv);
+int xrep_alloc_ag_block(struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv);
int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
@@ -32,7 +33,7 @@ struct xfs_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist);
int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist,
- struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -41,9 +42,6 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
- /* in: magic number of the btree */
- uint32_t magic;
-
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 5e293c129813..92a140c5b55e 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -174,24 +174,21 @@ int
xchk_rmapbt(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
-
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
- &oinfo, NULL);
+ &XFS_RMAP_OINFO_AG, NULL);
}
/* xref check that the extent is owned by a given owner */
static inline void
xchk_xref_check_owner(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
- bool should_have_rmap)
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool should_have_rmap)
{
- bool has_rmap;
- int error;
+ bool has_rmap;
+ int error;
if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
return;
@@ -207,10 +204,10 @@ xchk_xref_check_owner(
/* xref check that the extent is owned by a given owner */
void
xchk_xref_is_owned_by(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
xchk_xref_check_owner(sc, bno, len, oinfo, true);
}
@@ -218,10 +215,10 @@ xchk_xref_is_owned_by(
/* xref check that the extent is not owned by a given owner */
void
xchk_xref_is_not_owned_by(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
xchk_xref_check_owner(sc, bno, len, oinfo, false);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bbb17cc..dbe115b075f7 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
startext = fsbno;
endext = fsbno + len - 1;
do_div(startext, sc->mp->m_sb.sb_rextsize);
- if (do_div(endext, sc->mp->m_sb.sb_rextsize))
- endext++;
- extcount = endext - startext;
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
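The old rounding logic undercounted by one whenever the last block of the range landed exactly on the first block of a realtime extent; computing an inclusive end extent and adding one handles every case. A quick check of both formulas:

#include <stdio.h>

int main(void)
{
	unsigned long long rextsize = 16;	/* blocks per rt extent */
	unsigned long long fsbno = 0, len = 17;	/* blocks 0..16 */

	unsigned long long startext = fsbno / rextsize;

	/* Old: round the end up only when it lands mid-extent. */
	unsigned long long endx = fsbno + len - 1;
	unsigned long long old_end = endx / rextsize;
	if (endx % rextsize)
		old_end++;
	unsigned long long old_count = old_end - startext;

	/* New: inclusive end extent, then count = end - start + 1. */
	unsigned long long new_count = endx / rextsize - startext + 1;

	/* Blocks 0..16 touch extents 0 and 1: old says 1, new says 2. */
	printf("old %llu new %llu\n", old_count, new_count);
	return 0;
}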
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index af323b229c4b..22f754fba8e5 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -122,9 +122,9 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len);
void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo);
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo);
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len);
void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4e20f0e48232..3c83e8b3b39c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -12,6 +12,71 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer. Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
+
+#define XFS_SCRUB_TYPE_STRINGS \
+ { XFS_SCRUB_TYPE_PROBE, "probe" }, \
+ { XFS_SCRUB_TYPE_SB, "sb" }, \
+ { XFS_SCRUB_TYPE_AGF, "agf" }, \
+ { XFS_SCRUB_TYPE_AGFL, "agfl" }, \
+ { XFS_SCRUB_TYPE_AGI, "agi" }, \
+ { XFS_SCRUB_TYPE_BNOBT, "bnobt" }, \
+ { XFS_SCRUB_TYPE_CNTBT, "cntbt" }, \
+ { XFS_SCRUB_TYPE_INOBT, "inobt" }, \
+ { XFS_SCRUB_TYPE_FINOBT, "finobt" }, \
+ { XFS_SCRUB_TYPE_RMAPBT, "rmapbt" }, \
+ { XFS_SCRUB_TYPE_REFCNTBT, "refcountbt" }, \
+ { XFS_SCRUB_TYPE_INODE, "inode" }, \
+ { XFS_SCRUB_TYPE_BMBTD, "bmapbtd" }, \
+ { XFS_SCRUB_TYPE_BMBTA, "bmapbta" }, \
+ { XFS_SCRUB_TYPE_BMBTC, "bmapbtc" }, \
+ { XFS_SCRUB_TYPE_DIR, "directory" }, \
+ { XFS_SCRUB_TYPE_XATTR, "xattr" }, \
+ { XFS_SCRUB_TYPE_SYMLINK, "symlink" }, \
+ { XFS_SCRUB_TYPE_PARENT, "parent" }, \
+ { XFS_SCRUB_TYPE_RTBITMAP, "rtbitmap" }, \
+ { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \
+ { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
+ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
+ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -36,10 +101,10 @@ DECLARE_EVENT_CLASS(xchk_class,
__entry->flags = sm->sm_flags;
__entry->error = error;
),
- TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d",
+ TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->inum,
__entry->gen,
@@ -78,9 +143,9 @@ TRACE_EVENT(xchk_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->bno,
__entry->error,
@@ -109,11 +174,11 @@ TRACE_EVENT(xchk_file_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->error,
__entry->ret_ip)
@@ -144,9 +209,9 @@ DECLARE_EVENT_CLASS(xchk_block_error_class,
__entry->bno = bno;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->bno,
__entry->ret_ip)
@@ -176,10 +241,10 @@ DECLARE_EVENT_CLASS(xchk_ino_error_class,
__entry->type = sc->sm->sm_type;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->ret_ip)
)
@@ -213,11 +278,11 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class,
__entry->offset = offset;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->ret_ip)
);
@@ -244,9 +309,9 @@ TRACE_EVENT(xchk_incomplete,
__entry->type = sc->sm->sm_type;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->ret_ip)
);
@@ -278,10 +343,10 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -321,12 +386,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -360,10 +425,10 @@ TRACE_EVENT(xchk_btree_error,
__entry->ptr = cur->bc_ptrs[level];
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -400,12 +465,12 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ptr = cur->bc_ptrs[level];
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -439,10 +504,10 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_ptrs[level];
),
- TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->agno,
__entry->bno,
__entry->level,
@@ -473,13 +538,58 @@ TRACE_EVENT(xchk_xref_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u xref error %d ret_ip %pF",
+ TP_printk("dev %d:%d type %s xref error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->error,
__entry->ret_ip)
);
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -598,11 +708,11 @@ TRACE_EVENT(xrep_init_btblock,
__entry->agbno = agbno;
__entry->btnum = btnum;
),
- TP_printk("dev %d:%d agno %u agbno %u btnum %d",
+ TP_printk("dev %d:%d agno %u agbno %u btree %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
- __entry->btnum)
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
)
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
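These TRACE_DEFINE_ENUM/__print_symbolic changes replace raw integers in trace output with names drawn from { value, "string" } tables. Outside ftrace the same idea is a simple linear lookup; a sketch with illustrative values:

#include <stddef.h>
#include <stdio.h>

struct sym { int value; const char *name; };

/* Shape of an XFS_SCRUB_TYPE_STRINGS-style table (values illustrative). */
static const struct sym scrub_type_strings[] = {
	{ 0, "probe" }, { 1, "sb" }, { 2, "agf" }, { 3, "agfl" },
};

static const char *print_symbolic(int value, const struct sym *tbl, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].value == value)
			return tbl[i].name;
	return "unknown";
}

int main(void)
{
	size_t n = sizeof(scrub_type_strings) / sizeof(scrub_type_strings[0]);
	printf("type %s\n", print_symbolic(2, scrub_type_strings, n));
	return 0;
}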
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 338b9d9984e0..3619e9e8d359 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,7 +28,8 @@
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
@@ -62,7 +63,7 @@ xfs_find_daxdev_for_inode(
static void
xfs_finish_page_writeback(
struct inode *inode,
- struct bio_vec *bvec,
+ struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
@@ -98,6 +99,7 @@ xfs_destroy_ioend(
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
@@ -109,7 +111,7 @@ xfs_destroy_ioend(
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
@@ -255,30 +257,20 @@ xfs_end_io(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
@@ -293,7 +285,8 @@ xfs_end_bio(
struct xfs_ioend *ioend = bio->bi_private;
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -301,6 +294,75 @@ xfs_end_bio(
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a delalloc extent and convert it to real extents; return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
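[Editor's note] xfs_bmapi_convert_delalloc() may convert only part of the delalloc extent when free space is fragmented, which is why xfs_convert_blocks() loops until the returned mapping extends past offset_fsb. A toy restatement of the termination argument, assuming each call makes forward progress (illustrative types, not the kernel helpers):

    #include <stdint.h>

    struct map { uint64_t start, len; };

    /*
     * convert_once() stands in for xfs_bmapi_convert_delalloc(): it is
     * assumed to extend the allocated mapping by at least one block per
     * call, so map->start + map->len grows monotonically and the loop
     * below eventually covers off.
     */
    static int convert_until_covered(struct map *map, uint64_t off,
            int (*convert_once)(struct map *))
    {
        int error;

        do {
            error = convert_once(map);  /* may convert a short extent */
            if (error)
                return error;
        } while (map->start + map->len <= off);
        return 0;
    }
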
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
@@ -310,26 +372,16 @@ xfs_map_blocks(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +398,19 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
 * Check if this offset is covered by a COW extent, and if so use
@@ -382,30 +422,16 @@ xfs_map_blocks(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -417,49 +443,65 @@ xfs_map_blocks(
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging, the returned real extent might be larger than the
+ * original delalloc one. Trim the returned extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
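[Editor's note] Taken together, the (wpc->fork, wpc->imap.br_state) pair plus the HOLESTARTBLOCK check replaces the old io_type enum that the xfs_aops.h hunk further down removes. A rough correspondence, as read from this patch:

    /* old io_type        new representation                                 */
    /* XFS_IO_COW         wpc->fork == XFS_COW_FORK                          */
    /* XFS_IO_UNWRITTEN   fork == XFS_DATA_FORK, state == XFS_EXT_UNWRITTEN  */
    /* XFS_IO_OVERWRITE   fork == XFS_DATA_FORK, state == XFS_EXT_NORM       */
    /* XFS_IO_HOLE        wpc->imap.br_startblock == HOLESTARTBLOCK          */
    /* XFS_IO_DELALLOC    converted before I/O; never reaches an ioend       */
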
@@ -484,7 +526,7 @@ xfs_submit_ioend(
int status)
{
/* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
/*
* Yuk. This can do memory allocation, but is not a
* transactional operation so everything is done in GFP_KERNEL
@@ -502,7 +544,8 @@ xfs_submit_ioend(
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
@@ -531,7 +574,8 @@ xfs_submit_ioend(
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
sector_t sector)
@@ -545,7 +589,8 @@ xfs_alloc_ioend(
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
@@ -606,21 +651,23 @@ xfs_add_to_ioend(
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector);
}
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+ if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
@@ -721,7 +768,7 @@ xfs_writepage_map(
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
@@ -916,9 +963,7 @@ xfs_vm_writepage(
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
@@ -932,9 +977,7 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -981,7 +1024,7 @@ xfs_vm_bmap(
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 494b4338446e..6c2615b83c5d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,29 +9,12 @@
extern struct bio_set xfs_ioend_bioset;
/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
- XFS_IO_HOLE, /* covers region without any block allocation */
- XFS_IO_DELALLOC, /* covers delalloc region */
- XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
- XFS_IO_OVERWRITE, /* covers already allocated extent */
- XFS_IO_COW, /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
- { XFS_IO_HOLE, "hole" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }
-
-/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
- unsigned int io_type; /* delalloc / unwritten */
+ int io_fork; /* inode fork written back */
+ xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a58034049995..3d213a7394c5 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
attrlist_ent_t *aep;
int arraytop;
+ ASSERT(!context->seen_enough);
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5d263dfdb3bc..2db43ff4f8b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1042,7 +1042,7 @@ out_trans_cancel:
goto out_unlock;
}
-static int
+int
xfs_flush_unmap_range(
struct xfs_inode *ip,
xfs_off_t offset,
@@ -1126,9 +1126,9 @@ xfs_free_file_space(
* page could be mmap'd and iomap_zero_range doesn't do that for us.
* Writeback of the eof page will do this, albeit clumsily.
*/
- if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) {
+ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- (offset + len) & ~PAGE_MASK, LLONG_MAX);
+ round_down(offset + len, PAGE_SIZE), LLONG_MAX);
}
return error;
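[Editor's note] The replaced lines above had the two masks swapped: in the kernel, PAGE_MASK selects the page-aligned part of an address, so offset_in_page(x) is x & ~PAGE_MASK and round_down(x, PAGE_SIZE) is x & PAGE_MASK. A quick check with illustrative values, assuming 4096-byte pages:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096ULL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        uint64_t end = 0x12345;     /* stands in for offset + len */

        /* offset_in_page(end): byte offset within its page */
        assert((end & ~PAGE_MASK) == 0x345);
        /* round_down(end, PAGE_SIZE): start of the containing page */
        assert((end & PAGE_MASK) == 0x12000);
        return 0;
    }
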
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
* by virtue of the hole punch.
*/
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out;
+ if (error || xfs_is_always_cow_inode(ip))
+ return error;
- error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ return xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
-out:
- return error;
-
}
static int
@@ -1195,13 +1192,7 @@ xfs_prepare_shift(
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
- if (error)
- return error;
- error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- offset >> PAGE_SHIFT, -1);
- if (error)
- return error;
+ error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
/*
* Clean out anything hanging around in the cow fork now that
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 87363d136bb6..7a78229cf1a7 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, xfs_extnum_t *nextents,
xfs_filblks_t *count);
+int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+
#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b21ea2ba768d..548344e25128 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -776,13 +776,24 @@ _xfs_buf_read(
}
/*
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents. If the contents
- * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * Reverify a buffer found in cache without an attached ->b_ops.
+ *
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
+ *
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
*/
int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
struct xfs_buf *bp,
const struct xfs_buf_ops *ops)
{
@@ -824,7 +835,7 @@ xfs_buf_read_map(
return bp;
}
- xfs_buf_ensure_ops(bp, ops);
+ xfs_buf_reverify(bp, ops);
if (flags & XBF_ASYNC) {
/*
@@ -1536,8 +1547,7 @@ __xfs_buf_submit(
xfs_buf_ioerror(bp, -EIO);
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_ioend(bp);
+ xfs_buf_ioend(bp);
return -EIO;
}
@@ -1992,7 +2002,6 @@ xfs_buf_delwri_submit_buffers(
struct list_head *wait_list)
{
struct xfs_buf *bp, *n;
- LIST_HEAD (submit_list);
int pinned = 0;
struct blk_plug plug;
@@ -2195,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
atomic_set(&bp->b_lru_ref, lru_ref);
}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
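[Editor's note] The magic/magic16 arrays added to struct xfs_buf_ops (see the header change below) are indexed by xfs_sb_version_hascrc(): slot 0 holds the pre-CRC (v4) magic, slot 1 the v5 magic. A hypothetical verifier wired up to xfs_verify_magic() — XFS_FOO_* and the header struct are made-up names used only to show the pattern:

    #define XFS_FOO_MAGIC       0x464f4f31  /* hypothetical v4 magic */
    #define XFS_FOO_CRC_MAGIC   0x464f4f35  /* hypothetical v5 magic */

    struct xfs_foo_hdr {
        __be32      foo_magic;              /* hypothetical header field */
    };

    static void
    xfs_foo_read_verify(
        struct xfs_buf      *bp)
    {
        struct xfs_foo_hdr  *hdr = bp->b_addr;

        /* compares against magic[0] or magic[1] per the sb version */
        if (!xfs_verify_magic(bp, hdr->foo_magic))
            xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
    }

    static const struct xfs_buf_ops xfs_foo_buf_ops = {
        .name = "xfs_foo",
        .magic = { cpu_to_be32(XFS_FOO_MAGIC),
                   cpu_to_be32(XFS_FOO_CRC_MAGIC) },
        .verify_read = xfs_foo_read_verify,
    };
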
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511ea998..d0b96e071cec 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -125,6 +125,10 @@ struct xfs_buf_map {
struct xfs_buf_ops {
char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 12d8455bfbb2..010db5f8fb00 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1233,9 +1233,23 @@ xfs_buf_iodone(
}
/*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
*
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
*/
bool
xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
struct list_head *buffer_list)
{
struct xfs_log_item *lip;
+ bool ret;
+
+ ret = xfs_buf_delwri_queue(bp, buffer_list);
/*
- * Clear XFS_LI_FAILED flag from all items before resubmit
- *
- * XFS_LI_FAILED set/clear is protected by ail_lock, caller this
+ * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
 * function already has it acquired
*/
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
xfs_clear_li_failed(lip);
- /* Add this buffer back to the delayed write list */
- return xfs_buf_delwri_queue(bp, buffer_list);
+ return ret;
}
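[Editor's note] The reordering above matters because the failed log items may hold the only remaining references to the buffer: clearing XFS_LI_FAILED first could drop the last reference before the delwri queue takes its own. A toy model of the safe ordering with an explicit refcount (illustrative userspace code, not kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct obj { int refcount; };

    static struct obj *queue_head; /* one-slot stand-in for a delwri list */

    static void put_ref(struct obj *o)
    {
        if (--o->refcount == 0)
            free(o);        /* last reference: object is gone */
    }

    static void resubmit(struct obj *o)
    {
        o->refcount++;      /* queue the object first ...            */
        queue_head = o;
        put_ref(o);         /* ... then drop the failed-item ref;    */
    }                       /* the reverse order frees o too early   */

    int main(void)
    {
        struct obj *o = calloc(1, sizeof(*o));

        o->refcount = 1;    /* the failed log item's reference */
        resubmit(o);
        printf("still alive, refcount=%d\n", queue_head->refcount);
        put_ref(queue_head);
        return 0;
    }
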
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..a1e177f66404 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
fa, bp->b_ops->name, bp->b_bn, name);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..602aa7d62b66 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+#define XFS_PTAG_VERIFIER_ERROR 0x00000100
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d9da66c718bb..74ddf66f4cfe 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -494,7 +494,6 @@ xfs_efi_recover(
int error = 0;
xfs_extent_t *extp;
xfs_fsblock_t startblock_fsb;
- struct xfs_owner_info oinfo;
ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
@@ -526,11 +525,11 @@ xfs_efi_recover(
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
- xfs_rmap_any_owner_update(&oinfo);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &efip->efi_format.efi_extents[i];
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len, &oinfo, false);
+ extp->ext_len,
+ &XFS_RMAP_OINFO_ANY_OWNER, false);
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 61a5ad2600e8..1f2e2845eb76 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
@@ -872,14 +872,27 @@ xfs_file_fallocate(
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
+ }
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
}
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
@@ -919,28 +932,67 @@ out_unlock:
return error;
}
-STATIC int
-xfs_file_clone_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
-{
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, false);
-}
-STATIC int
-xfs_file_dedupe_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
+STATIC loff_t
+xfs_file_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ loff_t len,
+ unsigned int remap_flags)
{
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, true);
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ loff_t remapped = 0;
+ xfs_extlen_t cowextsize;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+ &remapped);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Carry the cowextsize hint from src to dest if we're sharing the
+ * entire source file to the entire destination file, the source file
+ * has a cowextsize hint, and the destination file does not.
+ */
+ cowextsize = 0;
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
+ (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_d.di_cowextsize;
+
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+ remap_flags);
+
+out_unlock:
+ xfs_reflink_remap_unlock(file_in, file_out);
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ return remapped > 0 ? remapped : ret;
}
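[Editor's note] xfs_file_remap_range() now backs the single VFS ->remap_file_range hook that FICLONE, FICLONERANGE, and dedupe requests all funnel into. A minimal userspace exercise of the clone path (file paths are placeholders; both files must live on an XFS filesystem created with reflink support):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FICLONERANGE, struct file_clone_range */

    int main(void)
    {
        int src = open("/mnt/xfs/src", O_RDONLY);
        int dst = open("/mnt/xfs/dst", O_WRONLY | O_CREAT, 0644);
        struct file_clone_range fcr = {
            .src_fd = src,
            .src_offset = 0,
            .src_length = 1 << 20,  /* share the first 1 MiB */
            .dest_offset = 0,
        };

        if (src < 0 || dst < 0 || ioctl(dst, FICLONERANGE, &fcr))
            perror("clone");        /* EOPNOTSUPP without reflink */
        return 0;
    }
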
STATIC int
@@ -1029,10 +1081,10 @@ xfs_file_llseek(
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
@@ -1164,6 +1216,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
@@ -1175,8 +1228,7 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
- .clone_file_range = xfs_file_clone_range,
- .dedupe_file_range = xfs_file_dedupe_range,
+ .remap_file_range = xfs_file_remap_range,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 093c2b8d7e20..584648582ba7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -40,7 +40,6 @@ xfs_growfs_data_private(
xfs_rfsblock_t new;
xfs_agnumber_t oagcount;
xfs_trans_t *tp;
- LIST_HEAD (buffer_list);
struct aghdr_init_data id = {};
nb = in->newblocks;
@@ -252,7 +251,7 @@ xfs_growfs_data(
if (mp->m_sb.sb_imax_pct) {
uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
do_div(icount, 100);
- mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+ mp->m_maxicount = XFS_FSB_TO_INO(mp, icount);
} else
mp->m_maxicount = 0;
@@ -534,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
int error = 0;
int err2;
+ mp->m_finobt_nores = false;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
pag = xfs_perag_get(mp, agno);
err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84ae382..d0d377384120 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 255 },
+ .panic_mask = { 0, 0, 256 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 05db9540e459..f643a9295179 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- if (unlikely(mp->m_inotbt_nores)) {
+ if (unlikely(mp->m_finobt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk. The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+ struct rhash_head iu_rhash_head;
+ xfs_agino_t iu_agino; /* X */
+ xfs_agino_t iu_next_unlinked; /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const xfs_agino_t *key = arg->key;
+ const struct xfs_iunlink *iu = obj;
+
+ if (iu->iu_next_unlinked != *key)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
+ .key_len = sizeof(xfs_agino_t),
+ .key_offset = offsetof(struct xfs_iunlink,
+ iu_next_unlinked),
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_iunlink *iu;
+
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ return iu ? iu->iu_agino : NULLAGINO;
+}
+
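[Editor's note] Because the records are keyed on iu_next_unlinked rather than iu_agino, finding the predecessor of an inode is a single hash lookup. For a bucket chain 17 -> 42 -> 99 the cache holds {17.next=42} and {42.next=99}, so looking up key 99 yields 42. A flat-array toy standing in for the rhashtable (illustrative only):

    #include <stdio.h>

    #define NULLAGINO (~0U)

    struct rel { unsigned agino, next; }; /* "agino.next_unlinked == next" */

    /* backrefs for the bucket chain 17 -> 42 -> 99 */
    static const struct rel cache[] = { { 17, 42 }, { 42, 99 } };

    static unsigned lookup_backref(unsigned agino)
    {
        for (unsigned i = 0; i < sizeof(cache) / sizeof(cache[0]); i++)
            if (cache[i].next == agino)
                return cache[i].agino;  /* the predecessor */
        return NULLAGINO;
    }

    int main(void)
    {
        printf("predecessor of 99: %u\n", lookup_backref(99)); /* 42 */
        return 0;
    }
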
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+ struct xfs_perag *pag,
+ struct xfs_iunlink *iu)
+{
+ int error;
+
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ /*
+ * Fail loudly if there already was an entry because that's a sign of
+ * corruption of in-memory data. Also fail loudly if we see an error
+ * code we didn't anticipate from the rhashtable code. Currently we
+ * only anticipate ENOMEM.
+ */
+ if (error) {
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+ kmem_free(iu);
+ }
+ /*
+ * Absorb any runtime errors that aren't a result of corruption because
+ * this is a cache and we can always fall back to bucket list scanning.
+ */
+ if (error != 0 && error != -EEXIST)
+ error = 0;
+ return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t this_agino)
+{
+ struct xfs_iunlink *iu;
+
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+ return 0;
+
+ iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+ iu->iu_agino = prev_agino;
+ iu->iu_next_unlinked = this_agino;
+
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agino_t next_unlinked)
+{
+ struct xfs_iunlink *iu;
+ int error;
+
+ /* Look up the old entry; if there wasn't one then exit. */
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ if (!iu)
+ return 0;
+
+ /*
+ * Remove the entry. This shouldn't ever return an error, but if we
+ * couldn't remove the old entry we don't want to add it again to the
+ * hash table, and if the entry disappeared on us then someone's
+ * violated the locking rules and we need to fail loudly. Either way
+ * we cannot remove the inode because internal state is or would have
+ * been corrupt.
+ */
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ if (error)
+ return error;
+
+ /* If there is no new next entry just free our item and return. */
+ if (next_unlinked == NULLAGINO) {
+ kmem_free(iu);
+ return 0;
+ }
+
+ /* Update the entry and re-add it to the hash table. */
+ iu->iu_next_unlinked = next_unlinked;
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+ struct xfs_perag *pag)
+{
+ return rhashtable_init(&pag->pagi_unlinked_hash,
+ &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+ void *ptr,
+ void *arg)
+{
+ struct xfs_iunlink *iu = ptr;
+ bool *freed_anything = arg;
+
+ *freed_anything = true;
+ kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+ struct xfs_perag *pag)
+{
+ bool freed_anything = false;
+
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+ xfs_iunlink_free_item, &freed_anything);
+
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino)
+ return -EFSCORRUPTED;
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_buf *ibp,
+ struct xfs_dinode *dip,
+ struct xfs_imap *imap,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = imap->im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ xfs_inobp_check(mp, ibp);
+}
+
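[Editor's note] xfs_trans_log_buf() takes an inclusive byte range, so the computation above logs only the small range covering di_next_unlinked within the inode cluster buffer rather than the whole inode. The arithmetic, with a stand-in struct and a hypothetical im_boffset value:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct dinode {             /* stand-in for struct xfs_dinode */
        uint16_t magic;
        uint16_t mode;
        uint32_t next_unlinked;
        /* ... */
    };

    int main(void)
    {
        unsigned boffset = 512; /* inode's byte offset in the buffer */
        unsigned off = boffset + offsetof(struct dinode, next_unlinked);

        /* inclusive range covering exactly the 4-byte pointer */
        printf("log bytes [%u, %u]\n", off, off + 4 - 1);
        return 0;
    }
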
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_agnumber_t agno,
+ xfs_agino_t next_agino,
+ xfs_agino_t *old_next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ xfs_agino_t old_value;
+ int error;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ if (error)
+ return error;
+
+ /* Make sure the old pointer isn't garbage. */
+ old_value = be32_to_cpu(dip->di_next_unlinked);
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ *old_next_agino = old_value;
+ if (old_value == next_agino) {
+ if (next_agino != NULLAGINO)
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Ok, update the new pointer. */
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ ibp, dip, &ip->i_imap, next_agino);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
*
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = tp->t_mountp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agino_t agino;
- short bucket_index;
- int offset;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t next_agino;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- ASSERT(agino != 0);
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(agi->agi_unlinked[bucket_index]);
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(mp, agno, next_agino))
+ return -EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ struct xfs_perag *pag;
+ xfs_agino_t old_agino;
+
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ &old_agino);
+ if (error)
+ return error;
+ ASSERT(old_agino == NULLAGINO);
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
/*
- * There is already another inode in the bucket we need
- * to add ourselves to. Add us at the front of the list.
- * Here we put the head pointer into our next pointer,
- * and then we fall through to point the head at us.
+ * agino has been unlinked, add a backref from the next inode
+ * back to agino.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
+ xfs_perag_put(pag);
if (error)
return error;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
+ imap->im_blkno = 0;
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino. Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t head_agino,
+ xfs_agino_t target_agino,
+ xfs_agino_t *agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agino_t next_agino;
+ int error;
+
+ ASSERT(head_agino != target_agino);
+ *bpp = NULL;
+
+ /* See if our backref cache can find it faster. */
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+ if (*agino != NULLAGINO) {
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+ if (error)
+ return error;
+
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+ return 0;
+
+ /*
+ * If we get here the cache contents were corrupt, so drop the
+ * buffer and fall back to walking the bucket list.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ WARN_ON_ONCE(1);
+ }
+
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+ /* Otherwise, walk the entire bucket until we find it. */
+ next_agino = head_agino;
+ while (next_agino != target_agino) {
+ xfs_agino_t unlinked_agino;
+
+ if (*bpp)
+ xfs_trans_brelse(tp, *bpp);
+
+ *agino = next_agino;
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+ bpp);
+ if (error)
+ return error;
+
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+ /*
+ * Make sure this pointer is valid and isn't an obvious
+ * infinite loop.
+ */
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ next_agino == unlinked_agino) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp,
+ *dipp, sizeof(**dipp));
+ error = -EFSCORRUPTED;
+ return error;
+ }
+ next_agino = unlinked_agino;
}
- /*
- * Point the bucket head pointer at the inode being inserted.
- */
- ASSERT(agino != 0);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
return 0;
}
@@ -1995,181 +2419,106 @@ xfs_iunlink(
*/
STATIC int
xfs_iunlink_remove(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_ino_t next_ino;
- xfs_mount_t *mp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_agino_t agino;
- xfs_agino_t next_agino;
- xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip = NULL;
- short bucket_index;
- int offset, last_offset = 0;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ struct xfs_buf *last_ibp;
+ struct xfs_dinode *last_dip = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t next_agino;
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
- mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ trace_xfs_iunlink_remove(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
-
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- if (!xfs_verify_agino(mp, agno, agino))
- return -EFSCORRUPTED;
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- if (!xfs_verify_agino(mp, agno,
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
}
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
- /*
- * We're at the head of the list. Get the inode's on-disk
- * buffer to see if there is anyone after us on the list.
- * Only modify our next pointer if it is not already NULLAGINO.
- * This saves us the overhead of dealing with the buffer when
- * there is no need to change it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the bucket head pointer at the next inode.
- */
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- } else {
- /*
- * We need to search the list for the inode being freed.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- last_ibp = NULL;
- while (next_agino != agino) {
- struct xfs_imap imap;
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ if (error)
+ return error;
- if (last_ibp)
- xfs_trans_brelse(tp, last_ibp);
+ /*
+ * If there was a backref pointing from the next inode back to this
+ * one, remove it because we've removed this inode from the list.
+ *
+ * Later, if this inode was in the middle of the list we'll update
+ * this inode's backref to point from the next inode.
+ */
+ if (next_agino != NULLAGINO) {
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_change_backref(pag, next_agino,
+ NULLAGINO);
+ if (error)
+ goto out;
+ }
- imap.im_blkno = 0;
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+ if (head_agino == agino) {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+ if (error)
+ goto out;
+ } else {
+ struct xfs_imap imap;
+ xfs_agino_t prev_agino;
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
+ if (!pag)
+ pag = xfs_perag_get(mp, agno);
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
- &last_ibp, 0, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
+ /* We need to search the list for the inode being freed. */
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp,
+ pag);
+ if (error)
+ goto out;
- last_offset = imap.im_boffset;
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
- if (!xfs_verify_agino(mp, agno, next_agino)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- last_dip, sizeof(*last_dip));
- return -EFSCORRUPTED;
- }
- }
+ /* Point the previous inode on the list to the next inode. */
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ last_dip, &imap, next_agino);
/*
- * Now last_ibp points to the buffer previous to us on the
- * unlinked list. Pull us from the list.
+ * Now we deal with the backref for this inode. If this inode
+ * pointed at a real inode, change the backref that pointed to
+ * us to point to our old next. If this inode was the end of
+ * the list, delete the backref that pointed to us. Note that
+ * change_backref takes care of deleting the backref if
+ * next_agino is NULLAGINO.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the previous inode on the list to the next inode.
- */
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
- ASSERT(next_agino != 0);
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, last_dip);
-
- xfs_trans_inode_buf(tp, last_ibp);
- xfs_trans_log_buf(tp, last_ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, last_ibp);
+ error = xfs_iunlink_change_backref(pag, agino, next_agino);
+ if (error)
+ goto out;
}
- return 0;
+
+out:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2184,8 +2533,6 @@ xfs_ifree_cluster(
struct xfs_icluster *xic)
{
xfs_mount_t *mp = free_ip->i_mount;
- int blks_per_cluster;
- int inodes_per_cluster;
int nbufs;
int i, j;
int ioffset;
@@ -2199,11 +2546,9 @@ xfs_ifree_cluster(
inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
- nbufs = mp->m_ialloc_blks / blks_per_cluster;
+ nbufs = mp->m_ialloc_blks / mp->m_blocks_per_cluster;
- for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+ for (j = 0; j < nbufs; j++, inum += mp->m_inodes_per_cluster) {
/*
* The allocation bitmap tells us which inodes of the chunk were
* physically allocated. Skip the cluster if an inode falls into
@@ -2211,7 +2556,7 @@ xfs_ifree_cluster(
*/
ioffset = inum - xic->first_ino;
if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
- ASSERT(ioffset % inodes_per_cluster == 0);
+ ASSERT(ioffset % mp->m_inodes_per_cluster == 0);
continue;
}
@@ -2227,7 +2572,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
+ mp->m_bsize * mp->m_blocks_per_cluster,
XBF_UNMAPPED);
if (!bp)
@@ -2242,7 +2587,7 @@ xfs_ifree_cluster(
 * want it to fail. We can achieve this by adding a write
* verifier to the buffer.
*/
- bp->b_ops = &xfs_inode_buf_ops;
+ bp->b_ops = &xfs_inode_buf_ops;
/*
* Walk the inodes already attached to the buffer and mark them
@@ -2274,7 +2619,7 @@ xfs_ifree_cluster(
* transaction stale above, which means there is no point in
* even trying to lock them.
*/
- for (i = 0; i < inodes_per_cluster; i++) {
+ for (i = 0; i < mp->m_inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2837,11 +3182,9 @@ xfs_rename_alloc_whiteout(
/*
* Prepare the tmpfile inode as if it were created through the VFS.
- * Otherwise, the link increment paths will complain about nlink 0->1.
- * Drop the link count as done by d_tmpfile(), complete the inode setup
- * and flag it as linkable.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
*/
- drop_nlink(VFS_I(tmpfile));
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..e62074a5257c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone;
bool xfs_inode_verify_forks(struct xfs_inode *ip);
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6e2c08f30f60..6ecdbb3af7de 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1608,7 +1608,7 @@ xfs_ioc_getbmap(
error = 0;
out_free_buf:
kmem_free(buf);
- return 0;
+ return error;
}
struct getfsmap_info {
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index fba115f4103a..5001dca361e9 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -241,6 +241,32 @@ xfs_compat_ioc_bulkstat(
int done;
int error;
+ /*
+ * Output structure handling functions. Depending on the command,
+ * either the xfs_bstat or the xfs_inogrp structures are written out
+ * to userspace memory via bulkreq.ubuffer. Normally the compat
+ * functions and structure size are the correct ones to use ...
+ */
+ inumbers_fmt_pf inumbers_func = xfs_inumbers_fmt_compat;
+ bulkstat_one_pf bs_one_func = xfs_bulkstat_one_compat;
+ size_t bs_one_size = sizeof(struct compat_xfs_bstat);
+
+#ifdef CONFIG_X86_X32
+ if (in_x32_syscall()) {
+ /*
+ * ... but on x32 the input xfs_fsop_bulkreq has pointers
+ * which must be handled in the "compat" (32-bit) way, while
+ * the xfs_bstat and xfs_inogrp structures follow native 64-
+ * bit layout convention. So adjust accordingly, otherwise
+ * the data written out in compat layout will not match what
+ * x32 userspace expects.
+ */
+ inumbers_func = xfs_inumbers_fmt;
+ bs_one_func = xfs_bulkstat_one;
+ bs_one_size = sizeof(struct xfs_bstat);
+ }
+#endif
+
/* done = 1 if there are more stats to get and if bulkstat */
/* should be called again (unused here, but used in dmapi) */
@@ -272,15 +298,15 @@ xfs_compat_ioc_bulkstat(
if (cmd == XFS_IOC_FSINUMBERS_32) {
error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer, xfs_inumbers_fmt_compat);
+ bulkreq.ubuffer, inumbers_func);
} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
int res;
- error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
- sizeof(compat_xfs_bstat_t), NULL, &res);
+ error = bs_one_func(mp, inlast, bulkreq.ubuffer,
+ bs_one_size, NULL, &res);
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
error = xfs_bulkstat(mp, &inlast, &count,
- xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+ bs_one_func, bs_one_size,
bulkreq.ubuffer, &done);
} else
error = -EINVAL;
@@ -336,6 +362,7 @@ xfs_compat_attrlist_by_handle(
{
int error;
attrlist_cursor_kern_t *cursor;
+ compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
compat_xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
char *kbuf;
@@ -370,6 +397,11 @@ xfs_compat_attrlist_by_handle(
if (error)
goto out_kfree;
+ if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+ error = -EFAULT;
+ goto out_kfree;
+ }
+
if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
error = -EFAULT;
@@ -547,8 +579,12 @@ xfs_file_compat_ioctl(
case FS_IOC_GETFSMAP:
case XFS_IOC_SCRUB_METADATA:
return xfs_file_ioctl(filp, cmd, p);
-#ifndef BROKEN_X86_ALIGNMENT
- /* These are handled fine if no alignment issues */
+#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32)
+ /*
+ * These are handled fine if there are no alignment issues. To
+ * support x32, which uses native 64-bit alignment, we must emit
+ * these cases in addition to the ia-32 compat set below.
+ */
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
@@ -561,8 +597,16 @@ xfs_file_compat_ioctl(
case XFS_IOC_FSGROWFSDATA:
case XFS_IOC_FSGROWFSRT:
case XFS_IOC_ZERO_RANGE:
+#ifdef CONFIG_X86_X32
+ /*
+ * x32 special: this gets a different cmd number from the ia-32 compat
+ * case below; the associated data will match native 64-bit alignment.
+ */
+ case XFS_IOC_SWAPEXT:
+#endif
return xfs_file_ioctl(filp, cmd, p);
-#else
+#endif
+#if defined(BROKEN_X86_ALIGNMENT)
case XFS_IOC_ALLOCSP_32:
case XFS_IOC_FREESP_32:
case XFS_IOC_ALLOCSP64_32:
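For illustration: xfs_compat_ioc_bulkstat above picks the output formatter and record size at runtime, defaulting to the compat (32-bit) layout and switching to the native 64-bit layout inside an x32 syscall. A minimal standalone sketch of that function-pointer dispatch pattern, with hypothetical stand-in types (stat_compat, stat_native, emit_fn):

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

/* Two on-the-wire layouts for the same logical record. */
struct stat_compat { unsigned int ino; };		/* 32-bit layout */
struct stat_native { unsigned long long ino; };		/* 64-bit layout */

typedef size_t (*emit_fn)(void *dst, unsigned long long ino);

static size_t emit_compat(void *dst, unsigned long long ino)
{
	((struct stat_compat *)dst)->ino = (unsigned int)ino;
	return sizeof(struct stat_compat);
}

static size_t emit_native(void *dst, unsigned long long ino)
{
	((struct stat_native *)dst)->ino = ino;
	return sizeof(struct stat_native);
}

int main(void)
{
	bool in_x32 = true;	/* stand-in for in_x32_syscall() */

	/* Default to the compat layout, switch to native for x32. */
	emit_fn emit = in_x32 ? emit_native : emit_compat;
	unsigned char buf[16];

	printf("wrote %zu bytes\n", emit(buf, 1234567ULL));
	return 0;
}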
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 27c93b5f029d..63d323916bba 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,18 +35,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-void
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool shared)
{
struct xfs_mount *mp = ip->i_mount;
+ if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
+ return 0;
}
static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
return 0;
}
-STATIC int
-xfs_alert_fsblock_zero(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *imap)
-{
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)imap->br_startblock,
- (unsigned long long)imap->br_startoff,
- (unsigned long long)imap->br_blockcount,
- imap->br_state);
- return -EFSCORRUPTED;
-}
-
int
xfs_iomap_write_direct(
xfs_inode_t *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_inode *ip,
+ int whichfork,
loff_t offset,
loff_t count,
struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmbt_irec prev;
int shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
xfs_fileoff_t end_fsb;
- int error = 0, eof = 0;
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int whichfork = XFS_DATA_FORK;
+ int error = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+ /*
+ * Search the data fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
- got.br_startoff = end_fsb; /* fake hole until the end */
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
- if (got.br_startoff <= offset_fsb) {
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first, this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * blocks adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second, the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ whichfork = XFS_COW_FORK;
+ goto done;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
/*
* For reflink files we may need a delalloc reservation when
* overwriting shared extents. This includes zeroing of
* existing extents that contain data.
*/
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- got.br_state != XFS_EXT_UNWRITTEN)) {
- xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got);
- if (error)
- goto out_unlock;
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
}
- trace_xfs_iomap_found(ip, offset, count, 0, &got);
- goto done;
- }
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
- goto out_unlock;
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ whichfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the value needs to be less than 32 bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (xfs_is_always_cow_inode(ip))
+ whichfork = XFS_COW_FORK;
}
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
- * to keep the chunks of work done where somewhat symmetric with the
- * work writeback does. This is a completely arbitrary number pulled
- * out of thin air as a best guess for initial testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
- &icur);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+ count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
}
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
- eof);
+ error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap,
+ whichfork == XFS_DATA_FORK ? &icur : &ccur,
+ whichfork == XFS_DATA_FORK ? eof : cow_eof);
switch (error) {
case 0:
break;
@@ -647,186 +711,22 @@ retry:
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap);
done:
- if (isnullstartblock(got.br_startblock))
- got.br_startblock = DELAYSTARTBLOCK;
-
- if (!got.br_startblock) {
- error = xfs_alert_fsblock_zero(ip, &got);
- if (error)
+ if (whichfork == XFS_COW_FORK) {
+ if (imap.br_startoff > offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
goto out_unlock;
- }
-
- xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- int whichfork,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- unsigned int *cow_seq)
-{
- xfs_mount_t *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_filblks_t count_fsb;
- xfs_trans_t *tp;
- int nimaps;
- int error = 0;
- int flags = XFS_BMAPI_DELALLOC;
- int nres;
-
- if (whichfork == XFS_COW_FORK)
- flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
- /*
- * Make sure that the dquots are there.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = imap->br_blockcount;
- map_start_fsb = imap->br_startoff;
-
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
- nimaps = 0;
- while (nimaps == 0) {
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- /*
- * We have already reserved space for the extent and any
- * indirect blocks when creating the delalloc extent,
- * there is no need to reserve space in this transaction
- * again.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * it is possible that the extents have changed since
- * we did the read call as we dropped the ilock for a
- * while. We have to be careful about truncates or hole
- * punchs here - we are not allowed to allocate
- * non-delalloc blocks here.
- *
- * The only protection against truncation is the pages
- * for the range we are being asked to convert are
- * locked and hence a truncate will block on them
- * first.
- *
- * As a result, if we go beyond the range we really
- * need and hit an delalloc extent boundary followed by
- * a hole while we have excess blocks in the map, we
- * will fill the hole incorrectly and overrun the
- * transaction reservation.
- *
- * Using a single map prevents this as we are forced to
- * check each map we look for overlap with the desired
- * range and abort as soon as we find it. Also, given
- * that we only return a single map, having one beyond
- * what we can return is probably a bit silly.
- *
- * We also need to check that we don't go beyond EOF;
- * this is a truncate optimisation as a truncate sets
- * the new file size before block on the pages we
- * currently have locked under writeback. Because they
- * are about to be tossed, we don't need to write them
- * back....
- */
- nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(ip, &last_block,
- XFS_DATA_FORK);
- if (error)
- goto trans_cancel;
-
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = -EAGAIN;
- goto trans_cancel;
- }
- }
-
- /*
- * From this point onwards we overwrite the imap
- * pointer that the caller gave to us.
- */
- error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, flags, nres, imap,
- &nimaps);
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
-
- if (whichfork == XFS_COW_FORK)
- *cow_seq = READ_ONCE(ifp->if_seq);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
- * covers at least part of the callers request
- */
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, imap);
-
- if ((offset_fsb >= imap->br_startoff) &&
- (offset_fsb < (imap->br_startoff +
- imap->br_blockcount))) {
- XFS_STATS_INC(mp, xs_xstrat_quick);
- return 0;
}
-
- /*
- * So far we have not mapped the requested part of the
- * file, just surrounding data, try again.
- */
- count_fsb -= imap->br_blockcount;
- map_start_fsb = imap->br_startoff + imap->br_blockcount;
+ /* ensure we only report blocks we have a reservation for */
+ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+ shared = true;
}
-
-trans_cancel:
- xfs_trans_cancel(tp);
-error0:
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
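For illustration: the rewritten xfs_file_iomap_begin_delay above decides between the data fork and the COW fork before reserving delalloc blocks. The standalone sketch below condenses that decision into one hypothetical function, with booleans standing in for the real extent lookups and shared-extent probe:

#include <stdbool.h>
#include <stdio.h>

enum fork { DATA_FORK, COW_FORK };

/*
 * Condensed fork-selection logic: prefer an existing COW mapping,
 * fall back to the data fork unless the extent is shared or the
 * inode is in always-COW mode.
 */
static enum fork pick_fork(bool cow_covers_offset, bool data_covers_offset,
			   bool shared, bool always_cow)
{
	if (cow_covers_offset)
		return COW_FORK;		/* reuse the COW reservation */
	if (data_covers_offset)
		return shared ? COW_FORK : DATA_FORK;
	return always_cow ? COW_FORK : DATA_FORK; /* hole: allocate new */
}

int main(void)
{
	/* Overwrite of a shared data extent must go through the COW fork. */
	printf("%d\n", pick_fork(false, true, true, false)); /* -> COW_FORK */
	return 0;
}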
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && is_write) {
+ if (xfs_is_cow_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
@@ -1009,7 +909,7 @@ relock:
* check, so if we got ILOCK_SHARED for a write but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
* Break shared extents if necessary. Checks for non-blocking IO have
* been done up front, so we don't need to do them here.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
+ struct xfs_bmbt_irec cmap;
+ bool directio = (flags & IOMAP_DIRECT);
+
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
!needs_cow_for_zeroing(&imap, nimaps))
goto out_found;
- if (flags & IOMAP_DIRECT) {
- /* may drop and re-acquire the ilock */
- error = xfs_reflink_allocate_cow(ip, &imap, &shared,
- &lockmode);
- if (error)
- goto out_unlock;
- } else {
- error = xfs_reflink_reserve_cow(ip, &imap);
- if (error)
- goto out_unlock;
- }
+ /* may drop and re-acquire the ilock */
+ cmap = imap;
+ error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+ directio);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For buffered writes we need to report the address of the
+ * previous block (if there was any) so that the higher level
+ * write code can perform read-modify-write operations; we
+ * won't need the CoW fork mapping until writeback. For direct
+ * I/O, which must be block aligned, we need to report the
+ * newly allocated address. If the data fork has a hole, copy
+ * the COW fork mapping to avoid allocating to the data fork.
+ */
+ if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ imap = cmap;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
return error;
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
- & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
-
- xfs_bmbt_to_iomap(ip, iomap, &imap);
-
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
- return 0;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
goto out_finish;
out_unlock:
@@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = {
};
static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
xfs_xattr_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
out_unlock:
xfs_iunlock(ip, lockmode);
- if (!error) {
- ASSERT(nimaps);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- }
-
- return error;
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
- struct xfs_bmbt_irec *, unsigned int *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *, bool shared);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
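For illustration: xfs_bmbt_to_iomap now returns an error for block-zero corruption and sets the IOMAP_F_DIRTY/IOMAP_F_SHARED flags itself. A simplified standalone model of that classification, using local stand-in types rather than the kernel structs:

#include <stdbool.h>
#include <stdint.h>

#define HOLESTARTBLOCK	((int64_t)-2)
#define DELAYSTARTBLOCK	((int64_t)-1)

enum map_type { MAP_HOLE, MAP_DELALLOC, MAP_MAPPED };
#define F_SHARED (1u << 0)
#define F_DIRTY  (1u << 1)

struct extent { int64_t startblock; };
struct mapping { enum map_type type; unsigned flags; };

/* Returns -1 for a corrupt block-zero mapping, 0 on success. */
static int extent_to_mapping(const struct extent *ext, struct mapping *map,
			     bool shared, bool fsync_pending)
{
	if (ext->startblock == 0)
		return -1;		/* block zero: corruption */

	if (ext->startblock == HOLESTARTBLOCK)
		map->type = MAP_HOLE;
	else if (ext->startblock == DELAYSTARTBLOCK)
		map->type = MAP_DELALLOC;
	else
		map->type = MAP_MAPPED;

	map->flags = 0;
	if (fsync_pending)
		map->flags |= F_DIRTY;	/* data still needs an fsync */
	if (shared)
		map->flags |= F_SHARED;	/* must COW before overwrite */
	return 0;
}

int main(void)
{
	struct extent ext = { .startblock = 8192 };
	struct mapping map;

	return extent_to_mapping(&ext, &map, true, false);
}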
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7a8d3e..74047bd0c1ae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -191,9 +191,18 @@ xfs_generic_create(
xfs_setup_iops(ip);
- if (tmpfile)
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
d_tmpfile(dentry, inode);
- else
+ } else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
@@ -522,6 +531,10 @@ xfs_vn_getattr(
}
}
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
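For illustration: the xfs_vn_getattr hunk fills stat->attributes_mask so userspace can distinguish "flag clear" from "flag not reported". From userspace the mask is visible through statx(2) (glibc 2.28+); a minimal consumer:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (statx(AT_FDCWD, argc > 1 ? argv[1] : ".", 0,
		  STATX_BASIC_STATS, &stx) != 0) {
		perror("statx");
		return 1;
	}

	/* Only trust a flag if the filesystem reports it in the mask. */
	if (stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE)
		printf("immutable: %s\n",
		       (stx.stx_attributes & STATX_ATTR_IMMUTABLE) ?
		       "yes" : "no");
	else
		printf("immutable: not reported by this filesystem\n");
	return 0;
}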
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e9508ba01ed1..942e4aa5e729 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -167,20 +167,18 @@ xfs_bulkstat_ichunk_ra(
{
xfs_agblock_t agbno;
struct blk_plug plug;
- int blks_per_cluster;
- int inodes_per_cluster;
int i; /* inode chunk index */
agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
blk_start_plug(&plug);
for (i = 0; i < XFS_INODES_PER_CHUNK;
- i += inodes_per_cluster, agbno += blks_per_cluster) {
- if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
- &xfs_inode_buf_ops);
+ i += mp->m_inodes_per_cluster, agbno += mp->m_blocks_per_cluster) {
+ if (xfs_inobt_maskn(i, mp->m_inodes_per_cluster) &
+ ~irec->ir_free) {
+ xfs_btree_reada_bufs(mp, agno, agbno,
+ mp->m_blocks_per_cluster,
+ &xfs_inode_buf_ops);
}
}
blk_finish_plug(&plug);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fc9e9042e0e..3371d1ff27c4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_BTREE_BUF:
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_allocbt_buf_ops;
+ bp->b_ops = &xfs_cntbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
- case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
- case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
@@ -3850,7 +3854,6 @@ xlog_recover_do_icreate_pass2(
unsigned int count;
unsigned int isize;
xfs_agblock_t length;
- int blks_per_cluster;
int bb_per_cluster;
int cancel_count;
int nbufs;
@@ -3918,14 +3921,13 @@ xlog_recover_do_icreate_pass2(
* buffers for cancellation so we don't overwrite anything written after
* a cancellation.
*/
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
- nbufs = length / blks_per_cluster;
+ bb_per_cluster = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+ nbufs = length / mp->m_blocks_per_cluster;
for (i = 0, cancel_count = 0; i < nbufs; i++) {
xfs_daddr_t daddr;
daddr = XFS_AGB_TO_DADDR(mp, agno,
- agbno + i * blks_per_cluster);
+ agbno + i * mp->m_blocks_per_cluster);
if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
cancel_count++;
}
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 576c375ce12a..6b736ea58d35 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -107,5 +107,5 @@ assfail(char *expr, char *file, int line)
void
xfs_hex_dump(void *p, int length)
{
- print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+ print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}
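For illustration: switching xfs_hex_dump to DUMP_PREFIX_OFFSET prints buffer-relative offsets instead of raw kernel addresses, which keeps pointer values out of the log. A rough userspace analogue of the offset-prefixed output:

#include <stdio.h>

/* Print 16 bytes per line, prefixed with the offset into the buffer. */
static void hex_dump_offset(const void *p, int length)
{
	const unsigned char *buf = p;

	for (int i = 0; i < length; i++) {
		if (i % 16 == 0)
			printf("%s%08x: ", i ? "\n" : "", i);
		printf("%02x ", buf[i]);
	}
	printf("\n");
}

int main(void)
{
	char data[40] = "example buffer contents for the dump";

	hex_dump_offset(data, sizeof(data));
	return 0;
}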
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 02d15098dbee..fd63b0b1307c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
@@ -798,6 +803,10 @@ xfs_mountfs(
if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
mp->m_inode_cluster_size = new_size;
}
+ mp->m_blocks_per_cluster = xfs_icluster_size_fsb(mp);
+ mp->m_inodes_per_cluster = XFS_FSB_TO_INO(mp, mp->m_blocks_per_cluster);
+ mp->m_cluster_align = xfs_ialloc_cluster_alignment(mp);
+ mp->m_cluster_align_inodes = XFS_FSB_TO_INO(mp, mp->m_cluster_align);
/*
* If enabled, sparse inode chunk alignment is expected to match the
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7964513c3128..110f927cf943 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -89,6 +89,13 @@ typedef struct xfs_mount {
int m_logbsize; /* size of each log buffer */
uint m_rsumlevels; /* rt summary levels */
uint m_rsumsize; /* size of rt summary, bytes */
+ /*
+ * Optional cache of rt summary level per bitmap block with the
+ * invariant that m_rsum_cache[bbno] <= the minimum i for which
+ * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+ * inode lock.
+ */
+ uint8_t *m_rsum_cache;
struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
@@ -101,6 +108,10 @@ typedef struct xfs_mount {
uint8_t m_agno_log; /* log #ag's */
uint8_t m_agino_log; /* #bits for agino in inum */
uint m_inode_cluster_size;/* min inode buf size */
+ unsigned int m_inodes_per_cluster;
+ unsigned int m_blocks_per_cluster;
+ unsigned int m_cluster_align;
+ unsigned int m_cluster_align_inodes;
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -127,7 +138,7 @@ typedef struct xfs_mount {
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint64_t m_flags; /* global mount flags */
- bool m_inotbt_nores; /* no per-AG finobt resv. */
+ bool m_finobt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_ialloc_min_blks;/* min blocks in sparse inode
@@ -183,6 +194,7 @@ typedef struct xfs_mount {
*/
uint32_t m_generation;
+ bool m_always_cow;
bool m_fail_unmount;
#ifdef DEBUG
/*
@@ -385,6 +397,13 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d20d8d4..c8ba98fae30a 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
}
#endif /* __XFS_ONDISK_H */
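For illustration: the XFS_CHECK_OFFSET assertions above pin the v4-compatible fields at offset zero of the v5 structures at compile time. The same invariant can be expressed with C11 _Static_assert and offsetof; the structs below are toy stand-ins, not the real on-disk formats:

#include <stddef.h>
#include <stdint.h>

/* Toy v4/v5-style headers: v5 must keep the magic at offset 0. */
struct hdr_v4 {
	uint32_t magic;
};

struct hdr_v5 {
	struct hdr_v4 v4;	/* legacy fields stay first */
	uint64_t owner;		/* v5-only additions follow */
	uint64_t lsn;
};

/* Fails to compile if the layout invariant is ever broken. */
_Static_assert(offsetof(struct hdr_v5, v4.magic) == 0,
	       "v5 header must keep the v4 magic at offset zero");

int main(void) { return 0; }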
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..bde2c9f56a46 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 73a1d77ec187..3091e4bc04ef 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -40,7 +40,7 @@ xfs_fill_statvfs_from_dquot(
statp->f_files = limit;
statp->f_ffree =
(statp->f_files > dqp->q_res_icount) ?
- (statp->f_ffree - dqp->q_res_icount) : 0;
+ (statp->f_files - dqp->q_res_icount) : 0;
}
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 8eaeec9d58ed..680ae7662a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -234,92 +234,59 @@ xfs_reflink_trim_around_shared(
}
}
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got;
- int error = 0;
- bool eof = false;
- struct xfs_iext_cursor icur;
- bool shared;
-
- /*
- * Search the COW fork extent list first. This serves two purposes:
- * first this implement the speculative preallocation using cowextisze,
- * so that we also unshared block adjacent to shared blocks instead
- * of just the shared blocks themselves. Second the lookup in the
- * extent list is generally faster than going out to the shared extent
- * tree.
- */
-
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
- eof = true;
- if (!eof && got.br_startoff <= imap->br_startoff) {
- trace_xfs_reflink_cow_found(ip, imap);
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
+ *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, &shared);
- if (error)
- return error;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared)
- return 0;
-
- /*
- * Fork all the shared blocks from our write offset until the end of
- * the extent.
- */
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- return error;
-
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &icur, eof);
- if (error == -ENOSPC || error == -EDQUOT)
- trace_xfs_reflink_cow_enospc(ip, imap);
- if (error)
- return error;
-
- trace_xfs_reflink_cow_alloc(ip, &got);
- return 0;
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- xfs_fileoff_t offset_fsb,
- xfs_filblks_t count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
{
- int nimaps = 1;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
- if (imap->br_state == XFS_EXT_NORM)
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
- xfs_trim_extent(imap, offset_fsb, count_fsb);
- trace_xfs_reflink_convert_cow(ip, imap);
- if (imap->br_blockcount == 0)
- return 0;
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
- &nimaps);
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
}
/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -333,15 +300,12 @@ xfs_reflink_convert_cow(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
- struct xfs_bmbt_irec imap;
- int nimaps = 1, error = 0;
+ int error;
ASSERT(count != 0);
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -374,7 +338,7 @@ xfs_find_trim_cow_extent(
if (got.br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
got.br_startoff - imap->br_startoff);
- return xfs_reflink_trim_around_shared(ip, imap, shared);
+ return xfs_inode_need_cow(ip, imap, shared);
}
*shared = true;
@@ -396,7 +360,8 @@ xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
- uint *lockmode)
+ uint *lockmode,
+ bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -408,7 +373,10 @@ xfs_reflink_allocate_cow(
xfs_extlen_t resblks = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
@@ -470,7 +438,16 @@ xfs_reflink_allocate_cow(
if (nimaps == 0)
return -ENOSPC;
convert:
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+ xfs_trim_extent(imap, offset_fsb, count_fsb);
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ return 0;
+ trace_xfs_reflink_convert_cow(ip, imap);
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -585,7 +562,7 @@ xfs_reflink_cancel_cow_range(
int error;
trace_xfs_reflink_cancel_cow_range(ip, offset, count);
- ASSERT(xfs_is_reflink_inode(ip));
+ ASSERT(ip->i_cowfp);
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (count == NULLFILEOFF)
@@ -622,54 +599,47 @@ out:
}
/*
- * Remap parts of a file's data fork after a successful CoW.
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
*/
-int
-xfs_reflink_end_cow(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t count)
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t *end_fsb)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got, del;
- struct xfs_trans *tp;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t end_fsb;
- int error;
- unsigned int resblks;
- xfs_filblks_t rlen;
- struct xfs_iext_cursor icur;
-
- trace_xfs_reflink_end_cow(ip, offset, count);
+ struct xfs_bmbt_irec got, del;
+ struct xfs_iext_cursor icur;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ xfs_filblks_t rlen;
+ unsigned int resblks;
+ int error;
/* No COW extents? That's easy! */
- if (ifp->if_bytes == 0)
+ if (ifp->if_bytes == 0) {
+ *end_fsb = offset_fsb;
return 0;
+ }
- offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
- end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+ if (error)
+ return error;
/*
- * Start a rolling transaction to switch the mappings. We're
- * unlikely ever to have to remap 16T worth of single-block
- * extents, so just cap the worst case extent count to 2^32-1.
- * Stick a warning in just in case, and avoid 64-bit division.
+ * Lock the inode. We have to ijoin without automatic unlock because
+ * the lead transaction is the refcountbt record deletion; the data
+ * fork update follows as a deferred log item.
*/
- BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
- if (end_fsb - offset_fsb > UINT_MAX) {
- error = -EFSCORRUPTED;
- xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
- ASSERT(0);
- goto out;
- }
- resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
- (unsigned int)(end_fsb - offset_fsb),
- XFS_DATA_FORK);
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
- if (error)
- goto out;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -678,80 +648,131 @@ xfs_reflink_end_cow(
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+ if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
+ got.br_startoff + got.br_blockcount <= offset_fsb) {
+ *end_fsb = offset_fsb;
goto out_cancel;
+ }
- /* Walk backwards until we're out of the I/O range... */
- while (got.br_startoff + got.br_blockcount > offset_fsb) {
- del = got;
- xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
-
- /* Extent delete may have bumped ext forward */
- if (!del.br_blockcount)
- goto prev_extent;
+ /*
+ * Structure copy @got into @del, then trim @del to the range that we
+ * were asked to remap. We preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
+ */
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
- /*
- * Only remap real extent that contain data. With AIO
- * speculatively preallocations can leak into the range we
- * are called upon, and we need to skip them.
- */
- if (!xfs_bmap_is_real_extent(&got))
- goto prev_extent;
+ ASSERT(del.br_blockcount > 0);
- /* Unmap the old blocks in the data fork. */
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
- if (error)
- goto out_cancel;
+ /*
+ * Only remap real extents that contain data. With AIO, speculative
+ * preallocations can leak into the range we are called upon, and we
+ * need to skip them.
+ */
+ if (!xfs_bmap_is_real_extent(&got)) {
+ *end_fsb = del.br_startoff;
+ goto out_cancel;
+ }
- /* Trim the extent to whatever got unmapped. */
- if (rlen) {
- xfs_trim_extent(&del, del.br_startoff + rlen,
- del.br_blockcount - rlen);
- }
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* Unmap the old blocks in the data fork. */
+ rlen = del.br_blockcount;
+ error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ if (error)
+ goto out_cancel;
- /* Free the CoW orphan record. */
- error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
- del.br_blockcount);
- if (error)
- goto out_cancel;
+ /* Trim the extent to whatever got unmapped. */
+ xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
+ trace_xfs_reflink_cow_remap(ip, &del);
- /* Map the new blocks into the data fork. */
- error = xfs_bmap_map_extent(tp, ip, &del);
- if (error)
- goto out_cancel;
+ /* Free the CoW orphan record. */
+ error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
+ del.br_blockcount);
+ if (error)
+ goto out_cancel;
- /* Charge this new data fork mapping to the on-disk quota. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
- (long)del.br_blockcount);
+ /* Map the new blocks into the data fork. */
+ error = xfs_bmap_map_extent(tp, ip, &del);
+ if (error)
+ goto out_cancel;
- /* Remove the mapping from the CoW fork. */
- xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
+ /* Charge this new data fork mapping to the on-disk quota. */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
+ (long)del.br_blockcount);
- error = xfs_defer_finish(&tp);
- if (error)
- goto out_cancel;
- if (!xfs_iext_get_extent(ifp, &icur, &got))
- break;
- continue;
-prev_extent:
- if (!xfs_iext_prev_extent(ifp, &icur, &got))
- break;
- }
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
- goto out;
+ return error;
+
+ /* Update the caller about how much progress we made. */
+ *end_fsb = del.br_startoff;
return 0;
out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
- trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remap parts of a file's data fork after a successful CoW.
+ */
+int
+xfs_reflink_end_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+ /*
+ * Walk backwards until we're out of the I/O range. The loop function
+ * repeatedly cycles the ILOCK to allocate one transaction per remapped
+ * extent.
+ *
+ * If we're being called by writeback then the pages will still
+ * have PageWriteback set, which prevents races with reflink remapping
+ * and truncate. Reflink remapping prevents races with writeback by
+ * taking the iolock and mmaplock before flushing the pages and
+ * remapping, which means there won't be any further writeback or page
+ * cache dirtying until the reflink completes.
+ *
+ * We should never have two threads issuing writeback for the same file
+ * region. There are also post-eof checks in the writeback
+ * preparation code so that we don't bother writing out pages that are
+ * about to be truncated.
+ *
+ * If we're being called as part of directio write completion, the dio
+ * count is still elevated, which reflink and truncate will wait for.
+ * Reflink remapping takes the iolock and mmaplock and waits for
+ * pending dio to finish, which should prevent any directio until the
+ * remap completes. Multiple concurrent directio writes to the same
+ * region are handled by end_cow processing only occurring for the
+ * threads which succeed; the outcome of multiple overlapping direct
+ * writes is not well defined anyway.
+ *
+ * It's possible that a buffered write and a direct write could collide
+ * here (the buffered write stumbles in after the dio flushes and
+ * invalidates the page cache and immediately queues writeback), but we
+ * have never supported this 100%. If either disk write succeeds the
+ * blocks will be remapped.
+ */
+ while (end_fsb > offset_fsb && !error)
+ error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+
+ if (error)
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
return error;
}
@@ -913,18 +934,18 @@ out_error:
/*
* Update destination inode size & cowextsize hint, if necessary.
*/
-STATIC int
+int
xfs_reflink_update_dest(
struct xfs_inode *dest,
xfs_off_t newlen,
xfs_extlen_t cowextsize,
- bool is_dedupe)
+ unsigned int remap_flags)
{
struct xfs_mount *mp = dest->i_mount;
struct xfs_trans *tp;
int error;
- if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
return 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -945,10 +966,6 @@ xfs_reflink_update_dest(
dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
}
- if (!is_dedupe) {
- xfs_trans_ichgtime(tp, dest,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
@@ -1112,19 +1129,28 @@ out:
/*
* Iteratively remap one file's extents (and holes) to another's.
*/
-STATIC int
+int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
- xfs_fileoff_t srcoff,
+ loff_t pos_in,
struct xfs_inode *dest,
- xfs_fileoff_t destoff,
- xfs_filblks_t len,
- xfs_off_t new_isize)
+ loff_t pos_out,
+ loff_t remap_len,
+ loff_t *remapped)
{
struct xfs_bmbt_irec imap;
+ xfs_fileoff_t srcoff;
+ xfs_fileoff_t destoff;
+ xfs_filblks_t len;
+ xfs_filblks_t range_len;
+ xfs_filblks_t remapped_len = 0;
+ xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- xfs_filblks_t range_len;
+
+ destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
+ srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
+ len = XFS_B_TO_FSB(src->i_mount, remap_len);
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
@@ -1139,10 +1165,10 @@ xfs_reflink_remap_blocks(
error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
xfs_iunlock(src, lock_mode);
if (error)
- goto err;
+ break;
ASSERT(nimaps == 1);
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
&imap);
/* Translate imap into the destination file. */
@@ -1153,23 +1179,24 @@ xfs_reflink_remap_blocks(
error = xfs_reflink_remap_extent(dest, &imap, destoff,
new_isize);
if (error)
- goto err;
+ break;
if (fatal_signal_pending(current)) {
error = -EINTR;
- goto err;
+ break;
}
/* Advance drange/srange */
srcoff += range_len;
destoff += range_len;
len -= range_len;
+ remapped_len += range_len;
}
- return 0;
-
-err:
- trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ *remapped = min_t(loff_t, remap_len,
+ XFS_FSB_TO_B(src->i_mount, remapped_len));
return error;
}
@@ -1218,7 +1245,7 @@ retry:
}
/* Unlock both inodes after they've been prepped for a range clone. */
-STATIC void
+void
xfs_reflink_remap_unlock(
struct file *file_in,
struct file *file_out)
@@ -1286,21 +1313,20 @@ xfs_reflink_zero_posteof(
* stale data in the destination file. Hence we reject these clone attempts with
* -EINVAL in this case.
*/
-STATIC int
+int
xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- u64 *len,
- bool is_dedupe)
+ loff_t *len,
+ unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
bool same_inode = (inode_in == inode_out);
- u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
/* Lock both files against IO */
@@ -1323,29 +1349,11 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- len, is_dedupe);
- if (ret <= 0)
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+ if (ret < 0 || *len == 0)
goto out_unlock;
- /*
- * If the dedupe data matches, chop off the partial EOF block
- * from the source file so we don't try to dedupe the partial
- * EOF block.
- */
- if (is_dedupe) {
- *len &= ~blkmask;
- } else if (*len & blkmask) {
- /*
- * The user is attempting to share a partial EOF block,
- * if it's inside the destination EOF then reject it.
- */
- if (pos_out + *len < i_size_read(inode_out)) {
- ret = -EINVAL;
- goto out_unlock;
- }
- }
-
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
@@ -1364,102 +1372,23 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
- /* Zap any page cache for the destination file's range. */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + *len) - 1);
-
- /* If we're altering the file contents... */
- if (!is_dedupe) {
- /*
- * ...update the timestamps (which will grab the ilock again
- * from xfs_fs_dirty_inode, so we have to call it before we
- * take the ilock).
- */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- goto out_unlock;
- }
-
- /*
- * ...clear the security bits if the process is not being run
- * by root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- goto out_unlock;
+ /*
+ * If pos_out > EOF, we may have dirtied blocks between EOF and
+ * pos_out. In that case, we need to extend the flush and unmap to cover
+ * from EOF to the end of the copy length.
+ */
+ if (pos_out > XFS_ISIZE(dest)) {
+ loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
+ ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+ } else {
+ ret = xfs_flush_unmap_range(dest, pos_out, *len);
}
-
- return 1;
-out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
- return ret;
-}
-
-/*
- * Link a range of blocks from one file to another.
- */
-int
-xfs_reflink_remap_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- struct xfs_mount *mp = src->i_mount;
- xfs_fileoff_t sfsbno, dfsbno;
- xfs_filblks_t fsblen;
- xfs_extlen_t cowextsize;
- ssize_t ret;
-
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return -EOPNOTSUPP;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- /* Prepare and then clone file data. */
- ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
- &len, is_dedupe);
- if (ret <= 0)
- return ret;
-
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
- dfsbno = XFS_B_TO_FSBT(mp, pos_out);
- sfsbno = XFS_B_TO_FSBT(mp, pos_in);
- fsblen = XFS_B_TO_FSB(mp, len);
- ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
- pos_out + len);
if (ret)
goto out_unlock;
- /*
- * Carry the cowextsize hint from src to dest if we're sharing the
- * entire source file to the entire destination file, the source file
- * has a cowextsize hint, and the destination file does not.
- */
- cowextsize = 0;
- if (pos_in == 0 && len == i_size_read(inode_in) &&
- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- pos_out == 0 && len >= i_size_read(inode_out) &&
- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
- cowextsize = src->i_d.di_cowextsize;
-
- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
- is_dedupe);
-
+ return 1;
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
- if (ret)
- trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
}
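
The branch above is the whole of the new flush logic, so a minimal standalone sketch may help. It restates the arithmetic from the hunk, assuming the xfs_flush_unmap_range() and XFS_ISIZE() helpers exactly as used there; it is illustrative, not the kernel implementation:

/*
 * Sketch of the flush-range computation from xfs_reflink_remap_prep():
 * if pos_out sits beyond EOF, blocks between EOF and pos_out may be
 * dirty, so the flush starts at EOF and is widened by the gap.
 */
static int
example_flush_for_remap(
	struct xfs_inode	*dest,
	loff_t			pos_out,
	loff_t			len)
{
	loff_t			isize = XFS_ISIZE(dest);

	if (pos_out > isize)
		/* cover [EOF, pos_out + len) in one flush */
		return xfs_flush_unmap_range(dest, isize,
				len + (pos_out - isize));

	/* destination range lies entirely within EOF */
	return xfs_flush_unmap_range(dest, pos_out, len);
}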
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 7f47202b5639..28a43b7f581d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow &&
+ xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+ struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+ bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
@@ -27,13 +39,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t len,
+ unsigned int remap_flags);
extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
struct xfs_inode *ip, bool *has_shared);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
+extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t *len,
+ unsigned int remap_flags);
+extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+ struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+ loff_t *remapped);
+extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+ xfs_extlen_t cowextsize, unsigned int remap_flags);
+extern void xfs_reflink_remap_unlock(struct file *file_in,
+ struct file *file_out);
#endif /* __XFS_REFLINK_H */
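
Since this header now exports the prep/remap/update/unlock helpers individually, a hedged sketch of how a caller strings them together may be useful. The caller below is assumed (presumably the ->remap_file_range handler in xfs_file.c, which is not part of this section); only the prototypes come from the header above, and note that xfs_reflink_remap_prep() returns 1 on success per the earlier hunk:

static loff_t
example_remap_file_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct xfs_inode	*src = XFS_I(file_inode(file_in));
	struct xfs_inode	*dest = XFS_I(file_inode(file_out));
	loff_t			remapped = 0;
	int			ret;

	/* Lock both inodes, flush dirty pages, and trim len as needed. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret <= 0)
		return ret;

	/* Share the extents; remapped reports how far we got. */
	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Grow i_size and bump timestamps on the destination. A real
	 * caller would also carry the cowextsize hint here instead of 0.
	 */
	ret = xfs_reflink_update_dest(dest, pos_out + remapped, 0,
			remap_flags);
out_unlock:
	xfs_reflink_remap_unlock(file_in, file_out);
	return ret < 0 ? ret : remapped;
}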
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 926ed314ffba..ac0fcdad0c4e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -64,8 +64,12 @@ xfs_rtany_summary(
int log; /* loop counter, log2 of ext. size */
xfs_suminfo_t sum; /* summary data */
+ /* There are no extents at levels < m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
+ low = mp->m_rsum_cache[bbno];
+
/*
- * Loop over logs of extent sizes. Order is irrelevant.
+ * Loop over logs of extent sizes.
*/
for (log = low; log <= high; log++) {
/*
@@ -80,13 +84,17 @@ xfs_rtany_summary(
*/
if (sum) {
*stat = 1;
- return 0;
+ goto out;
}
}
/*
* Found nothing, return failure.
*/
*stat = 0;
+out:
+ /* There were no extents at levels < log. */
+ if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
return 0;
}
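
The two cache touch points above define the whole m_rsum_cache contract: cache[bbno] is a lower bound on the smallest log2 extent size with any free extents in bitmap block bbno, so searches may skip levels below it and tighten it when a scan proves further levels empty. A small illustrative sketch (not kernel code; the level_has_free() callback stands in for the summary lookup):

static int
example_first_free_level(
	uint8_t		*cache,		/* lower bound per bitmap block */
	int		bbno,		/* bitmap block number */
	int		low,
	int		high,
	int		(*level_has_free)(int log, int bbno))
{
	int		log;

	/* There are no free extents at levels below the cached bound. */
	if (cache && low < cache[bbno])
		low = cache[bbno];

	for (log = low; log <= high; log++)
		if (level_has_free(log, bbno))
			break;

	/* The scan proved all levels below log empty; remember that. */
	if (cache && log > cache[bbno])
		cache[bbno] = log;

	return log > high ? -1 : log;	/* -1: nothing in [low, high] */
}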
@@ -853,6 +861,21 @@ out_trans_cancel:
return error;
}
+static void
+xfs_alloc_rsum_cache(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
+{
+ /*
+ * The rsum cache is initialized to all zeroes, which is trivially a
+ * lower bound on the minimum level with any free extents. We can
+ * continue without the cache if it couldn't be allocated.
+ */
+ mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP);
+ if (!mp->m_rsum_cache)
+ xfs_warn(mp, "could not allocate realtime summary cache");
+}
+
/*
* Visible (exported) functions.
*/
@@ -881,6 +904,7 @@ xfs_growfs_rt(
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
xfs_fsblock_t sumbno; /* summary block number */
+ uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
/*
@@ -937,6 +961,11 @@ xfs_growfs_rt(
error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
if (error)
return error;
+
+ rsum_cache = mp->m_rsum_cache;
+ if (nrbmblocks != sbp->sb_rbmblocks)
+ xfs_alloc_rsum_cache(mp, nrbmblocks);
+
/*
* Allocate a new (fake) mount/sb.
*/
@@ -1062,6 +1091,20 @@ error_cancel:
*/
kmem_free(nmp);
+ /*
+ * If we had to allocate a new rsum_cache, we either need to free the
+ * old one (if we succeeded) or free the new one and restore the old one
+ * (if there was an error).
+ */
+ if (rsum_cache != mp->m_rsum_cache) {
+ if (error) {
+ kmem_free(mp->m_rsum_cache);
+ mp->m_rsum_cache = rsum_cache;
+ } else {
+ kmem_free(rsum_cache);
+ }
+ }
+
return error;
}
@@ -1187,8 +1230,8 @@ xfs_rtmount_init(
}
/*
- * Get the bitmap and summary inodes into the mount structure
- * at mount time.
+ * Get the bitmap and summary inodes and the summary cache into the mount
+ * structure at mount time.
*/
int /* error */
xfs_rtmount_inodes(
@@ -1198,19 +1241,18 @@ xfs_rtmount_inodes(
xfs_sb_t *sbp;
sbp = &mp->m_sb;
- if (sbp->sb_rbmino == NULLFSINO)
- return 0;
error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
- ASSERT(sbp->sb_rsumino != NULLFSINO);
+
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
if (error) {
xfs_irele(mp->m_rbmip);
return error;
}
ASSERT(mp->m_rsumip != NULL);
+ xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
return 0;
}
@@ -1218,6 +1260,7 @@ void
xfs_rtunmount_inodes(
struct xfs_mount *mp)
{
+ kmem_free(mp->m_rsum_cache);
if (mp->m_rbmip)
xfs_irele(mp->m_rbmip);
if (mp->m_rsumip)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d3e6cd063688..f093ea244849 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -38,6 +38,7 @@
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
+#include "xfs_defer.h"
#include <linux/namei.h>
#include <linux/dax.h>
@@ -607,7 +608,7 @@ xfs_set_inode_alloc(
}
/* Get the last possible inode in the filesystem */
- agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1);
ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
/*
@@ -1149,7 +1150,7 @@ xfs_fs_statfs(
statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
statp->f_bavail = statp->f_bfree;
- fakeinos = statp->f_bfree << sbp->sb_inopblog;
+ fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
if (mp->m_maxicount)
statp->f_files = min_t(typeof(statp->f_files),
@@ -1593,6 +1594,13 @@ xfs_mount_alloc(
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
return mp;
}
@@ -1728,11 +1736,18 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
"reflink not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
@@ -2085,11 +2100,6 @@ init_xfs_fs(void)
printk(KERN_INFO XFS_VERSION_STRING " with "
XFS_BUILD_OPTIONS " enabled\n");
- xfs_extent_free_init_defer_op();
- xfs_rmap_update_init_defer_op();
- xfs_refcount_update_init_defer_op();
- xfs_bmap_update_init_defer_op();
-
xfs_dir_startup();
error = xfs_init_zones();
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index a3e98c64b6e3..b2c1177c717f 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -192,6 +192,7 @@ xfs_symlink(
pathlen = strlen(target_path);
if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */
return -ENAMETOOLONG;
+ ASSERT(pathlen > 0);
udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
@@ -378,6 +379,12 @@ out_release_inode:
/*
* Free a symlink that has blocks associated with it.
+ *
+ * Note: zero length symlinks are not allowed to exist. When we set the size to
+ * zero, also change it to a regular file so that it does not get written to
+ * disk as a zero length symlink. The inode is already on the unlinked list, so
+ * userspace cannot find it; the change is therefore not user visible, but it
+ * lets the verifiers catch corrupt zero-length symlinks.
*/
STATIC int
xfs_inactive_symlink_rmt(
@@ -412,13 +419,14 @@ xfs_inactive_symlink_rmt(
xfs_trans_ijoin(tp, ip, 0);
/*
- * Lock the inode, fix the size, and join it to the transaction.
- * Hold it so in the normal path, we still have it locked for
- * the second transaction. In the error paths we need it
+ * Lock the inode, fix the size, turn it into a regular file and join it
+ * to the transaction. Hold it so in the normal path, we still have it
+ * locked for the second transaction. In the error paths we need it
* held so the cancel won't rele it, see below.
*/
size = (int)ip->i_d.di_size;
ip->i_d.di_size = 0;
+ VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
* Find the block(s) so we can inval and unmap them.
@@ -494,17 +502,10 @@ xfs_inactive_symlink(
return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Zero length symlinks _can_ exist.
- */
pathlen = (int)ip->i_d.di_size;
- if (!pathlen) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
- }
+ ASSERT(pathlen);
- if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
+ if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -512,12 +513,12 @@ xfs_inactive_symlink(
return -EFSCORRUPTED;
}
+ /*
+ * Inline fork state gets removed by xfs_difree() so we have nothing to
+ * do here in that case.
+ */
if (ip->i_df.if_flags & XFS_IFINLINE) {
- if (ip->i_df.if_bytes > 0)
- xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
- XFS_DATA_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- ASSERT(ip->i_df.if_bytes == 0);
return 0;
}
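
The retype in xfs_inactive_symlink_rmt() above is a mask-and-set on i_mode: S_IFMT masks the file type bits, so clearing them and OR-ing in S_IFREG turns the already-unlinked inode into a regular file while keeping its permission bits. A plain C illustration (helper name invented):

/* Illustrative only: retype an inode mode as a regular file. */
static inline umode_t
example_retype_as_regular(
	umode_t		mode)
{
	return (mode & ~S_IFMT) | S_IFREG;	/* permission bits survive */
}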
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..ad7f9be13087 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,7 @@ struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..cabda13f3c64 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -183,10 +183,34 @@ mount_delay_show(
}
XFS_SYSFS_ATTR_RW(mount_delay);
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
NULL,
};
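
Usage note (partly an assumption, since the kobject registration is not shown in this hunk): xfs_dbg_attrs backs the DEBUG-only xfs sysfs directory, so the new knob should surface as /sys/fs/xfs/debug/always_cow and accept any value kstrtobool() understands ("0"/"1", "y"/"n"). Writes take effect for subsequently mounted reflink filesystems, per the mount-time check added to xfs_fs_fill_super() earlier in this patch.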
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3043e5ed6495..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- __entry->bno = bp->b_bn;
+ if (bp->b_bn == XFS_BUF_DADDR_NULL)
+ __entry->bno = bp->b_maps[0].bm_bn;
+ else
+ __entry->bno = bp->b_bn;
__entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
@@ -637,6 +640,16 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer. Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(PE_SIZE_PTE);
+TRACE_DEFINE_ENUM(PE_SIZE_PMD);
+TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
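
As a hedged sketch of the pattern that comment describes (the real TP_printk body for this event lies outside the hunk, and the __entry field name below is invented): TRACE_DEFINE_ENUM exports each enum constant's value to the tracing core, and __print_symbolic() then decodes the recorded integer when the ring buffer is formatted:

	TP_printk("pe_size %s",
		  __print_symbolic(__entry->pe_size,
				   { PE_SIZE_PTE, "pte" },
				   { PE_SIZE_PMD, "pmd" },
				   { PE_SIZE_PUD, "pud" }))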
@@ -1207,15 +1220,15 @@ DEFINE_READPAGE_EVENT(xfs_vm_readpages);
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int type, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, type, irec),
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, type)
+ __field(int, whichfork)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -1226,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->type = type;
+ __entry->whichfork = whichfork;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
)
-#define DEFINE_IOMAP_EVENT(name) \
+#define DEFINE_IMAP_EVENT(name) \
DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int type, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1882,11 +1895,11 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
{ 0, "target" }, \
{ 1, "temp" }
-#define XFS_INODE_FORMAT_STR \
- { 0, "invalid" }, \
- { 1, "local" }, \
- { 2, "extent" }, \
- { 3, "btree" }
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
DECLARE_EVENT_CLASS(xfs_swap_extent_class,
TP_PROTO(struct xfs_inode *ip, int which),
@@ -2175,6 +2188,14 @@ DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
/* btree cursor events */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
@@ -2194,9 +2215,9 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
__entry->ptr = cur->bc_ptrs[level];
__entry->daddr = bp ? bp->b_bn : -1;
),
- TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx",
+ TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->btnum,
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->nlevels,
__entry->ptr,
@@ -2273,7 +2294,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type->type;
+ __entry->type = dfp->dfp_type;
__entry->intent = dfp->dfp_intent;
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
@@ -2402,7 +2423,7 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
DECLARE_EVENT_CLASS(xfs_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
- struct xfs_owner_info *oinfo),
+ const struct xfs_owner_info *oinfo),
TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -2437,7 +2458,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
DEFINE_EVENT(xfs_rmap_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
- struct xfs_owner_info *oinfo), \
+ const struct xfs_owner_info *oinfo), \
TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
/* simple AG-based error/%ip tracepoint class */
@@ -2607,10 +2628,9 @@ DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
/* ag btree lookup tracepoint class */
-#define XFS_AG_BTREE_CMP_FORMAT_STR \
- { XFS_LOOKUP_EQ, "eq" }, \
- { XFS_LOOKUP_LE, "le" }, \
- { XFS_LOOKUP_GE, "ge" }
+TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_lookup_t dir),
@@ -3052,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
TRACE_EVENT(xfs_reflink_remap_blocks_loop,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
@@ -3176,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3345,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
DEFINE_TRANS_EVENT(xfs_trans_add_item);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno %u agino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a0c5dbda18aa..c6e1c5704a8c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -223,13 +223,13 @@ void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
-void xfs_extent_free_init_defer_op(void);
struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *,
struct xfs_efi_log_item *,
uint);
int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
- xfs_extlen_t, struct xfs_owner_info *,
+ xfs_extlen_t,
+ const struct xfs_owner_info *,
bool);
int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **);
@@ -248,7 +248,6 @@ extern kmem_zone_t *xfs_trans_zone;
/* rmap updates */
enum xfs_rmap_intent_type;
-void xfs_rmap_update_init_defer_op(void);
struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
struct xfs_rui_log_item *ruip);
int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
@@ -260,7 +259,6 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
/* refcount updates */
enum xfs_refcount_intent_type;
-void xfs_refcount_update_init_defer_op(void);
struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
struct xfs_cui_log_item *cuip);
int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
@@ -272,7 +270,6 @@ int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
/* mapping updates */
enum xfs_bmap_intent_type;
-void xfs_bmap_update_init_defer_op(void);
struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
struct xfs_bui_log_item *buip);
int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 741c558b2179..e1c7d55b32c3 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -220,8 +220,7 @@ xfs_bmap_update_cancel_item(
kmem_free(bmap);
}
-static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_BMAP,
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
.diff_items = xfs_bmap_update_diff_items,
.create_intent = xfs_bmap_update_create_intent,
@@ -231,10 +230,3 @@ static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
};
-
-/* Register the deferred op type. */
-void
-xfs_bmap_update_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 629f1479c9d2..7d65ebf1e847 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
* release this buffer when it kills the transaction.
*/
ASSERT(bp->b_ops != NULL);
- error = xfs_buf_ensure_ops(bp, ops);
+ error = xfs_buf_reverify(bp, ops);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 855c0b651fd4..8ee7a3f8bb20 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -52,19 +52,20 @@ xfs_trans_get_efd(struct xfs_trans *tp,
*/
int
xfs_trans_free_extent(
- struct xfs_trans *tp,
- struct xfs_efd_log_item *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len,
- struct xfs_owner_info *oinfo,
- bool skip_discard)
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
{
- struct xfs_mount *mp = tp->t_mountp;
- uint next_extent;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block);
- struct xfs_extent *extp;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
+ start_block);
+ int error;
trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
@@ -206,8 +207,7 @@ xfs_extent_free_cancel_item(
kmem_free(free);
}
-static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_FREE,
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
.diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
@@ -274,8 +274,7 @@ xfs_agfl_free_finish_item(
/* sub-type with special handling for AGFL deferred frees */
-static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_AGFL_FREE,
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
.diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
@@ -285,11 +284,3 @@ static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
};
-
-/* Register the deferred op type. */
-void
-xfs_extent_free_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_extent_free_defer_type);
- xfs_defer_init_op_type(&xfs_agfl_free_defer_type);
-}
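
With the .type members dropped and the op-type tables exported here and in the sibling files, the deferred-ops core no longer needs runtime registration; presumably xfs_defer.c now indexes the tables from a static array keyed by the existing enum, along these lines (an assumption, since that file is not part of this section):

static const struct xfs_defer_op_type *defer_op_types[] = {
	[XFS_DEFER_OPS_TYPE_BMAP]	= &xfs_bmap_update_defer_type,
	[XFS_DEFER_OPS_TYPE_REFCOUNT]	= &xfs_refcount_update_defer_type,
	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
};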
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 523c55663954..8d734728dd1b 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -227,8 +227,7 @@ xfs_refcount_update_cancel_item(
kmem_free(refc);
}
-static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_REFCOUNT,
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
.diff_items = xfs_refcount_update_diff_items,
.create_intent = xfs_refcount_update_create_intent,
@@ -239,10 +238,3 @@ static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.finish_cleanup = xfs_refcount_update_finish_cleanup,
.cancel_item = xfs_refcount_update_cancel_item,
};
-
-/* Register the deferred op type. */
-void
-xfs_refcount_update_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 05b00e40251f..5c7936b1be13 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -244,8 +244,7 @@ xfs_rmap_update_cancel_item(
kmem_free(rmap);
}
-static const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_RMAP,
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
.diff_items = xfs_rmap_update_diff_items,
.create_intent = xfs_rmap_update_create_intent,
@@ -256,10 +255,3 @@ static const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.finish_cleanup = xfs_rmap_update_finish_cleanup,
.cancel_item = xfs_rmap_update_cancel_item,
};
-
-/* Register the deferred op type. */
-void
-xfs_rmap_update_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_rmap_update_defer_type);
-}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5bf1d7..9a63016009a1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
char *offset;
int arraytop;
+ if (context->count < 0 || context->seen_enough)
+ return;
+
if (!context->alist)
goto compute_size;